diff options
597 files changed, 17210 insertions, 10955 deletions
@@ -50,14 +50,14 @@ build:remote --extra_toolchains=//tools/bazeldefs:cc-toolchain-clang-x86_64-defa build:remote --extra_execution_platforms=//tools/bazeldefs:rbe_ubuntu1604 build:remote --platforms=//tools/bazeldefs:rbe_ubuntu1604 build:remote --crosstool_top=@rbe_default//cc:toolchain -build:remote --jobs=50 +build:remote --jobs=100 build:remote --remote_timeout=3600 build:remote3 --host_platform=//tools/bazeldefs:rbe_ubuntu1604_bazel3 build:remote3 --extra_toolchains=//tools/bazeldefs:cc-toolchain-clang-x86_64-default_bazel3 build:remote3 --extra_execution_platforms=//tools/bazeldefs:rbe_ubuntu1604_bazel3 build:remote3 --platforms=//tools/bazeldefs:rbe_ubuntu1604_bazel3 build:remote3 --crosstool_top=@rbe_default//cc:toolchain -build:remote3 --jobs=50 +build:remote3 --jobs=100 build:remote3 --remote_timeout=3600 # Set flags for uploading to BES in order to view results in the Bazel Build diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index a81d06cca..4da3853b2 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -14,10 +14,11 @@ jobs: - id: setup run: | if ! [[ -z "${{ secrets.GO_TOKEN }}" ]]; then - echo ::set-output has_token=true + echo ::set-output name=has_token::true else - echo ::set-output has_token=false + echo ::set-output name=has_token::false fi + - run: | jq -nc '{"state": "pending", "context": "go tests"}' | \ curl -sL -X POST -d @- \ -H "Content-Type: application/json" \ @@ -48,7 +49,9 @@ jobs: key: ${{ runner.os }}-bazel-${{ hashFiles('WORKSPACE') }} restore-keys: | ${{ runner.os }}-bazel- - - run: make build TARGETS="//:gopath" + - run: | + rm -rf bazel-bin/gopath + make build TARGETS="//:gopath" - run: tools/go_branch.sh - run: git checkout go && git clean -f - run: go build ./... diff --git a/.github/workflows/issue_reviver.yml b/.github/workflows/issue_reviver.yml index e68e15270..2b399a3f2 100644 --- a/.github/workflows/issue_reviver.yml +++ b/.github/workflows/issue_reviver.yml @@ -8,9 +8,9 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - if: github.repository == "google/gvisor" + if: github.repository == 'google/gvisor' - run: make run TARGETS="//tools/issue_reviver" - if: github.repository == "google/gvisor" + if: github.repository == 'google/gvisor' env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_REPOSITORY: ${{ github.repository }} diff --git a/.travis.yml b/.travis.yml index 1d955b05d..2d9fa80a1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -40,7 +40,7 @@ script: # On arm64, we need to create our own pipes for stderr and stdout, # otherwise we will not be able to open /dev/stderr. This is probably # due to AppArmor rules. - - bash -xeo pipefail -c 'uname -a && make smoke-test 2>&1 | cat' + - bash -xeo pipefail -c 'uname -a && make smoke-tests 2>&1 | cat' branches: except: # Skip copybara branches. @@ -30,7 +30,7 @@ doc( permalink = "/community/governance/", subcategory = "Community", visibility = ["//website:__pkg__"], - weight = "91", + weight = "20", ) doc( @@ -117,12 +117,12 @@ debian: ## Builds the debian packages. @$(call submake,build OPTIONS="-c opt" TARGETS="//runsc:runsc-debian") .PHONY: debian -smoke-test: ## Runs a simple smoke test after build runsc. +smoke-tests: ## Runs a simple smoke test after build runsc. @$(call submake,run DOCKER_PRIVILEGED="" ARGS="--alsologtostderr --network none --debug --TESTONLY-unsafe-nonroot=true --rootless do true") .PHONY: smoke-tests unit-tests: ## Local package unit tests in pkg/..., runsc/, tools/.., etc. - @$(call submake,test TARGETS="pkg/... runsc/... tools/... benchmarks/... benchmarks/runner:runner_test") + @$(call submake,test TARGETS="pkg/... runsc/... tools/...") tests: ## Runs all unit tests and syscall tests. tests: unit-tests @@ -155,7 +155,7 @@ syscall-tests: syscall-ptrace-tests syscall-kvm-tests syscall-native-tests %-runtime-tests: load-runtimes_% @$(call submake,install-test-runtime) - @$(call submake,test-runtime TARGETS="//test/runtimes:$*") + @$(call submake,test-runtime OPTIONS="--test_timeout=10800" TARGETS="//test/runtimes:$*") do-tests: runsc @$(call submake,run TARGETS="//runsc" ARGS="--rootless do true") @@ -185,7 +185,6 @@ swgso-tests: load-basic-images @$(call submake,install-test-runtime RUNTIME="swgso" ARGS="--software-gso=true --gso=false") @$(call submake,test-runtime RUNTIME="swgso" TARGETS="$(INTEGRATION_TARGETS)") .PHONY: swgso-tests - hostnet-tests: load-basic-images @$(call submake,install-test-runtime RUNTIME="hostnet" ARGS="--network=host") @$(call submake,test-runtime RUNTIME="hostnet" OPTIONS="--test_arg=-checkpoint=false" TARGETS="$(INTEGRATION_TARGETS)") @@ -211,14 +210,15 @@ packetdrill-tests: load-packetdrill .PHONY: packetdrill-tests packetimpact-tests: load-packetimpact + @sudo modprobe iptable_filter ip6table_filter @$(call submake,install-test-runtime RUNTIME="packetimpact") - @$(call submake,test-runtime RUNTIME="packetimpact" TARGETS="$(shell $(MAKE) query TARGETS='attr(tags, packetimpact, tests(//...))')") + @$(call submake,test-runtime OPTIONS="--jobs=HOST_CPUS*3 --local_test_jobs=HOST_CPUS*3" RUNTIME="packetimpact" TARGETS="$(shell $(MAKE) query TARGETS='attr(tags, packetimpact, tests(//...))')") .PHONY: packetimpact-tests root-tests: load-basic-images @$(call submake,install-test-runtime) @$(call submake,sudo TARGETS="//test/root:root_test" ARGS="-test.v") -.PHONY: test-root +.PHONY: root-tests # Specific containerd version tests. containerd-test-%: load-basic_alpine load-basic_python load-basic_busybox load-basic_resolv load-basic_httpd install-test-runtime @@ -266,7 +266,7 @@ website-push: website-build ## Push a new image and update the service. website-deploy: website-push ## Deploy a new version of the website. @gcloud run deploy $(WEBSITE_SERVICE) --platform=managed --region=$(WEBSITE_REGION) --project=$(WEBSITE_PROJECT) --image=$(WEBSITE_IMAGE) -.PHONY: website-push +.PHONY: website-deploy ## ## Repository builders. @@ -349,7 +349,7 @@ dev: ## Installs a set of local runtimes. Requires sudo. refresh: ## Refreshes the runtime binary (for development only). Must have called 'dev' or 'install-test-runtime' first. @mkdir -p "$(RUNTIME_DIR)" @$(call submake,copy TARGETS=runsc DESTINATION="$(RUNTIME_BIN)") -.PHONY: install +.PHONY: refresh install-test-runtime: ## Installs the runtime for testing. Requires sudo. @$(call submake,refresh ARGS="--net-raw --TESTONLY-test-name-env=RUNSC_TEST_NAME --debug --strace --log-packets $(ARGS)") @@ -370,5 +370,5 @@ configure: ## Configures a single runtime. Requires sudo. Typically called from .PHONY: configure test-runtime: ## A convenient wrapper around test that provides the runtime argument. Target must still be provided. - @$(call submake,test OPTIONS="$(OPTIONS) --test_output=streamed --test_arg=--runtime=$(RUNTIME)") + @$(call submake,test OPTIONS="$(OPTIONS) --test_arg=--runtime=$(RUNTIME)") .PHONY: test-runtime @@ -58,7 +58,7 @@ Make sure the following dependencies are installed: Build and install the `runsc` binary: -``` +```sh make runsc sudo cp ./bazel-bin/runsc/linux_amd64_pure_stripped/runsc /usr/local/bin ``` @@ -67,14 +67,14 @@ sudo cp ./bazel-bin/runsc/linux_amd64_pure_stripped/runsc /usr/local/bin To run standard test suites, you can use: -``` +```sh make unit-tests make tests ``` To run specific tests, you can specify the target: -``` +```sh make test TARGETS="//runsc:version_test" ``` @@ -84,12 +84,19 @@ This project uses [bazel][bazel] to build and manage dependencies. A synthetic `go` branch is maintained that is compatible with standard `go` tooling for convenience. -For example, to build `runsc` directly from this branch: +For example, to build and install `runsc` directly from this branch: -``` +```sh echo "module runsc" > go.mod GO111MODULE=on go get gvisor.dev/gvisor/runsc@go -CGO_ENABLED=0 GO111MODULE=on go install gvisor.dev/gvisor/runsc +CGO_ENABLED=0 GO111MODULE=on sudo -E go build -o /usr/local/bin/runsc gvisor.dev/gvisor/runsc +``` + +Subsequently, you can build and install the shim binaries for `containerd`: + +```sh +GO111MODULE=on sudo -E go build -o /usr/local/bin/gvisor-containerd-shim gvisor.dev/gvisor/shim/v1 +GO111MODULE=on sudo -E go build -o /usr/local/bin/containerd-shim-runsc-v1 gvisor.dev/gvisor/shim/v2 ``` Note that this branch is supported in a best effort capacity, and direct @@ -113,26 +113,6 @@ rules_proto_dependencies() rules_proto_toolchains() -# Load python dependencies. -git_repository( - name = "rules_python", - commit = "abc4869e02fe9b3866942e89f07b7341f830e805", - remote = "https://github.com/bazelbuild/rules_python.git", - shallow_since = "1583341286 -0500", -) - -load("@rules_python//python:pip.bzl", "pip_import") - -pip_import( - name = "pydeps", - python_interpreter = "python3", - requirements = "//benchmarks:requirements.txt", -) - -load("@pydeps//:requirements.bzl", "pip_install") - -pip_install() - # Load bazel_toolchain to support Remote Build Execution. # See releases at https://releases.bazel.build/bazel-toolchains.html http_archive( @@ -1089,3 +1069,10 @@ go_repository( sum = "h1:iVjPR7a6H0tWELX5NxNe7bYopibicUzc7uPribsnS6o=", version = "v1.0.0", ) + +go_repository( + name = "com_github_dpjacques_clockwork", + importpath = "github.com/dpjacques/clockwork", + sum = "h1:7krODee+eIlZYoLiEDmP1kLFNCvd0bQ0eEXOympdN6U=", + version = "v0.1.1-0.20190114191937-d864eecc357b", +) diff --git a/benchmarks/BUILD b/benchmarks/BUILD deleted file mode 100644 index 389351210..000000000 --- a/benchmarks/BUILD +++ /dev/null @@ -1,29 +0,0 @@ -package(licenses = ["notice"]) - -config_setting( - name = "gcloud_rule", - values = { - "define": "gcloud=off", - }, -) - -py_binary( - name = "benchmarks", - testonly = 1, - srcs = ["run.py"], - data = select({ - ":gcloud_rule": [], - "//conditions:default": [ - "//tools/vm:ubuntu1604", - "//tools/vm:zone", - ], - }), - main = "run.py", - python_version = "PY3", - srcs_version = "PY3", - tags = [ - "local", - "manual", - ], - deps = ["//benchmarks/runner"], -) diff --git a/benchmarks/README.md b/benchmarks/README.md deleted file mode 100644 index 814bcb220..000000000 --- a/benchmarks/README.md +++ /dev/null @@ -1,186 +0,0 @@ -# Benchmark tools - -These scripts are tools for collecting performance data for Docker-based tests. - -## Setup - -The scripts assume the following: - -* There are two sets of machines: one where the scripts will be run - (controller) and one or more machines on which docker containers will be run - (environment). -* The controller machine must have bazel installed along with this source - code. You should be able to run a command like `bazel run //benchmarks -- - --list` -* Environment machines must have docker and the required runtimes installed. - More specifically, you should be able to run a command like: `docker run - --runtime=$RUNTIME your/image`. -* The controller has ssh private key which can be used to login to environment - machines and run docker commands without using `sudo`. This is not required - if running locally via the `run-local` command. -* The docker daemon on each of your environment machines is listening on - `unix:///var/run/docker.sock` (docker's default). - -For configuring the environment manually, consult the -[dockerd documentation][dockerd]. - -## Running benchmarks - -### Locally - -The tool is built to, by default, use Google Cloud Platform to run benchmarks, -but it does support GCP workflows. To run locally, run the following from the -benchmarks directory: - -```bash -bazel run --define gcloud=off //benchmarks -- run-local startup - -... -method,metric,result -startup.empty,startup_time_ms,652.5772 -startup.node,startup_time_ms,1654.4042000000002 -startup.ruby,startup_time_ms,1429.835 -``` - -The above command ran the startup benchmark locally, which consists of three -benchmarks (empty, node, and ruby). Benchmark tools ran it on the default -runtime, runc. Running on another installed runtime, like say runsc, is as -simple as: - -```bash -bazel run --define gcloud=off //benchmarks -- run-local startup --runtime=runsc -``` - -There is help: - -```bash -bazel run --define gcloud=off //benchmarks -- --help -bazel run --define gcloud=off //benchmarks -- run-local --help -``` - -To list available benchmarks, use the `list` commmand: - -```bash -bazel --define gcloud=off run //benchmarks -- list - -... -Benchmark: sysbench.cpu -Metrics: events_per_second - Run sysbench CPU test. Additional arguments can be provided for sysbench. - - :param max_prime: The maximum prime number to search. -``` - -You can choose benchmarks by name or regex like: - -```bash -bazel run --define gcloud=off //benchmarks -- run-local startup.node -... -metric,result -startup_time_ms,1671.7178000000001 - -``` - -or - -```bash -bazel run --define gcloud=off //benchmarks -- run-local s -... -method,metric,result -startup.empty,startup_time_ms,1792.8292 -startup.node,startup_time_ms,3113.5274 -startup.ruby,startup_time_ms,3025.2424 -sysbench.cpu,cpu_events_per_second,12661.47 -sysbench.memory,memory_ops_per_second,7228268.44 -sysbench.mutex,mutex_time,17.4835 -sysbench.mutex,mutex_latency,3496.7 -sysbench.mutex,mutex_deviation,0.04 -syscall.syscall,syscall_time_ns,2065.0 -``` - -You can run parameterized benchmarks, for example to run with different -runtimes: - -```bash -bazel run --define gcloud=off //benchmarks -- run-local --runtime=runc --runtime=runsc sysbench.cpu -``` - -Or with different parameters: - -```bash -bazel run --define gcloud=off //benchmarks -- run-local --max_prime=10 --max_prime=100 sysbench.cpu -``` - -### On Google Compute Engine (GCE) - -Benchmarks may be run on GCE in an automated way. The default project configured -for `gcloud` will be used. - -An additional parameter `installers` may be provided to ensure that the latest -runtime is installed from the workspace. See the files in `tools/installers` for -supported install targets. - -```bash -bazel run //benchmarks -- run-gcp --installers=head --runtime=runsc sysbench.cpu -``` - -When running on GCE, the scripts generate a per run SSH key, which is added to -your project. The key is set to expire in GCE after 60 minutes and is stored in -a temporary directory on the local machine running the scripts. - -## Writing benchmarks - -To write new benchmarks, you should familiarize yourself with the structure of -the repository. There are three key components. - -## Harness - -The harness makes use of the [docker py SDK][docker-py]. It is advisable that -you familiarize yourself with that API when making changes, specifically: - -* clients -* containers -* images - -In general, benchmarks need only interact with the `Machine` objects provided to -the benchmark function, which are the machines defined in the environment. These -objects allow the benchmark to define the relationships between different -containers, and parse the output. - -## Workloads - -The harness requires workloads to run. These are all available in the -`workloads` directory. - -In general, a workload consists of a Dockerfile to build it (while these are not -hermetic, in general they should be as fixed and isolated as possible), some -parsers for output if required, parser tests and sample data. Provided the test -is named after the workload package and contains a function named `sample`, this -variable will be used to automatically mock workload output when the `--mock` -flag is provided to the main tool. - -## Writing benchmarks - -Benchmarks define the tests themselves. All benchmarks have the following -function signature: - -```python -def my_func(output) -> float: - return float(output) - -@benchmark(metrics = my_func, machines = 1) -def my_benchmark(machine: machine.Machine, arg: str): - return "3.4432" -``` - -Each benchmark takes a variable amount of position arguments as -`harness.Machine` objects and some set of keyword arguments. It is recommended -that you accept arbitrary keyword arguments and pass them through when -constructing the container under test. - -To write a new benchmark, open a module in the `suites` directory and use the -above signature. You should add a descriptive doc string to describe what your -benchmark is and any test centric arguments. - -[dockerd]: https://docs.docker.com/engine/reference/commandline/dockerd/ -[docker-py]: https://docker-py.readthedocs.io/en/stable/ diff --git a/benchmarks/defs.bzl b/benchmarks/defs.bzl deleted file mode 100644 index 56d28223e..000000000 --- a/benchmarks/defs.bzl +++ /dev/null @@ -1,14 +0,0 @@ -"""Provides attributes common to many workload tests.""" - -load("//tools:defs.bzl", "py_requirement") - -test_deps = [ - py_requirement("attrs", direct = False), - py_requirement("atomicwrites", direct = False), - py_requirement("more-itertools", direct = False), - py_requirement("pathlib2", direct = False), - py_requirement("pluggy", direct = False), - py_requirement("py", direct = False), - py_requirement("pytest"), - py_requirement("six", direct = False), -] diff --git a/benchmarks/examples/localhost.yaml b/benchmarks/examples/localhost.yaml deleted file mode 100644 index f70fe0fb7..000000000 --- a/benchmarks/examples/localhost.yaml +++ /dev/null @@ -1,2 +0,0 @@ -client: localhost -server: localhost diff --git a/benchmarks/harness/BUILD b/benchmarks/harness/BUILD deleted file mode 100644 index 2090d957a..000000000 --- a/benchmarks/harness/BUILD +++ /dev/null @@ -1,201 +0,0 @@ -load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement") - -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -pkg_tar( - name = "installers", - srcs = [ - "//tools/installers:head", - "//tools/installers:master", - ], - mode = "0755", -) - -filegroup( - name = "files", - srcs = [ - ":installers", - ], -) - -py_library( - name = "harness", - srcs = ["__init__.py"], - data = [ - ":files", - ], -) - -py_library( - name = "benchmark_driver", - srcs = ["benchmark_driver.py"], - deps = [ - "//benchmarks/harness/machine_mocks", - "//benchmarks/harness/machine_producers:machine_producer", - "//benchmarks/suites", - ], -) - -py_library( - name = "container", - srcs = ["container.py"], - deps = [ - "//benchmarks/workloads", - py_requirement( - "asn1crypto", - direct = False, - ), - py_requirement( - "chardet", - direct = False, - ), - py_requirement( - "certifi", - direct = False, - ), - py_requirement("docker"), - py_requirement( - "docker-pycreds", - direct = False, - ), - py_requirement( - "idna", - direct = False, - ), - py_requirement( - "ptyprocess", - direct = False, - ), - py_requirement( - "requests", - direct = False, - ), - py_requirement( - "urllib3", - direct = False, - ), - py_requirement( - "websocket-client", - direct = False, - ), - ], -) - -py_library( - name = "machine", - srcs = ["machine.py"], - deps = [ - "//benchmarks/harness", - "//benchmarks/harness:container", - "//benchmarks/harness:ssh_connection", - "//benchmarks/harness:tunnel_dispatcher", - "//benchmarks/harness/machine_mocks", - py_requirement( - "asn1crypto", - direct = False, - ), - py_requirement( - "chardet", - direct = False, - ), - py_requirement( - "certifi", - direct = False, - ), - py_requirement("docker"), - py_requirement( - "docker-pycreds", - direct = False, - ), - py_requirement( - "idna", - direct = False, - ), - py_requirement( - "ptyprocess", - direct = False, - ), - py_requirement( - "requests", - direct = False, - ), - py_requirement( - "six", - direct = False, - ), - py_requirement( - "urllib3", - direct = False, - ), - py_requirement( - "websocket-client", - direct = False, - ), - ], -) - -py_library( - name = "ssh_connection", - srcs = ["ssh_connection.py"], - deps = [ - "//benchmarks/harness", - py_requirement( - "bcrypt", - direct = False, - ), - py_requirement("cffi"), - py_requirement("paramiko"), - py_requirement( - "cryptography", - direct = False, - ), - ], -) - -py_library( - name = "tunnel_dispatcher", - srcs = ["tunnel_dispatcher.py"], - deps = [ - py_requirement( - "asn1crypto", - direct = False, - ), - py_requirement( - "chardet", - direct = False, - ), - py_requirement( - "certifi", - direct = False, - ), - py_requirement("docker"), - py_requirement( - "docker-pycreds", - direct = False, - ), - py_requirement( - "idna", - direct = False, - ), - py_requirement("pexpect"), - py_requirement( - "ptyprocess", - direct = False, - ), - py_requirement( - "requests", - direct = False, - ), - py_requirement( - "urllib3", - direct = False, - ), - py_requirement( - "websocket-client", - direct = False, - ), - ], -) diff --git a/benchmarks/harness/__init__.py b/benchmarks/harness/__init__.py deleted file mode 100644 index 15aa2a69a..000000000 --- a/benchmarks/harness/__init__.py +++ /dev/null @@ -1,62 +0,0 @@ -# python3 -# Copyright 2019 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Core benchmark utilities.""" - -import getpass -import os -import subprocess -import tempfile - -# LOCAL_WORKLOADS_PATH defines the path to use for local workloads. This is a -# format string that accepts a single string parameter. -LOCAL_WORKLOADS_PATH = os.path.dirname(__file__) + "/../workloads/{}/tar.tar" - -# REMOTE_WORKLOADS_PATH defines the path to use for storing the workloads on the -# remote host. This is a format string that accepts a single string parameter. -REMOTE_WORKLOADS_PATH = "workloads/{}" - -# INSTALLER_ROOT is the set of files that needs to be copied. -INSTALLER_ARCHIVE = os.readlink(os.path.join( - os.path.dirname(__file__), "installers.tar")) - -# SSH_KEY_DIR holds SSH_PRIVATE_KEY for this run. bm-tools paramiko requires -# keys generated with the '-t rsa -m PEM' options from ssh-keygen. This is -# abstracted away from the user. -SSH_KEY_DIR = tempfile.TemporaryDirectory() -SSH_PRIVATE_KEY = "key" - -# DEFAULT_USER is the default user running this script. -DEFAULT_USER = getpass.getuser() - -# DEFAULT_USER_HOME is the home directory of the user running the script. -DEFAULT_USER_HOME = os.environ["HOME"] if "HOME" in os.environ else "" - -# Default directory to remotely installer "installer" targets. -REMOTE_INSTALLERS_PATH = "installers" - - -def make_key(): - """Wraps a valid ssh key in a temporary directory.""" - path = os.path.join(SSH_KEY_DIR.name, SSH_PRIVATE_KEY) - if not os.path.exists(path): - cmd = "ssh-keygen -t rsa -m PEM -b 4096 -f {key} -q -N".format( - key=path).split(" ") - cmd.append("") - subprocess.run(cmd, check=True) - return path - - -def delete_key(): - """Deletes temporary directory containing private key.""" - SSH_KEY_DIR.cleanup() diff --git a/benchmarks/harness/benchmark_driver.py b/benchmarks/harness/benchmark_driver.py deleted file mode 100644 index 9abc21b54..000000000 --- a/benchmarks/harness/benchmark_driver.py +++ /dev/null @@ -1,85 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Main driver for benchmarks.""" - -import copy -import statistics -import threading -import types - -from benchmarks import suites -from benchmarks.harness.machine_producers import machine_producer - - -# pylint: disable=too-many-instance-attributes -class BenchmarkDriver: - """Allocates machines and invokes a benchmark method.""" - - def __init__(self, - producer: machine_producer.MachineProducer, - method: types.FunctionType, - runs: int = 1, - **kwargs): - - self._producer = producer - self._method = method - self._kwargs = copy.deepcopy(kwargs) - self._threads = [] - self.lock = threading.RLock() - self._runs = runs - self._metric_results = {} - - def start(self): - """Starts a benchmark thread.""" - for _ in range(self._runs): - thread = threading.Thread(target=self._run_method) - thread.start() - self._threads.append(thread) - - def join(self): - """Joins the thread.""" - # pylint: disable=expression-not-assigned - [t.join() for t in self._threads] - - def _run_method(self): - """Runs all benchmarks.""" - machines = self._producer.get_machines( - suites.benchmark_machines(self._method)) - try: - result = self._method(*machines, **self._kwargs) - for name, res in result: - with self.lock: - if name in self._metric_results: - self._metric_results[name].append(res) - else: - self._metric_results[name] = [res] - finally: - # Always release. - self._producer.release_machines(machines) - - def median(self): - """Returns the median result, after join is finished.""" - for key, value in self._metric_results.items(): - yield key, [statistics.median(value)] - - def all(self): - """Returns all results.""" - for key, value in self._metric_results.items(): - yield key, value - - def meanstd(self): - """Returns all results.""" - for key, value in self._metric_results.items(): - mean = statistics.mean(value) - yield key, [mean, statistics.stdev(value, xbar=mean)] diff --git a/benchmarks/harness/container.py b/benchmarks/harness/container.py deleted file mode 100644 index 585436e20..000000000 --- a/benchmarks/harness/container.py +++ /dev/null @@ -1,181 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Container definitions.""" - -import contextlib -import logging -import pydoc -import types -from typing import Tuple - -import docker -import docker.errors - -from benchmarks import workloads - - -class Container: - """Abstract container. - - Must be a context manager. - - Usage: - - with Container(client, image, ...): - ... - """ - - def run(self, **env) -> str: - """Run the container synchronously.""" - raise NotImplementedError - - def detach(self, **env): - """Run the container asynchronously.""" - raise NotImplementedError - - def address(self) -> Tuple[str, int]: - """Return the bound address for the container.""" - raise NotImplementedError - - def get_names(self) -> types.GeneratorType: - """Return names of all containers.""" - raise NotImplementedError - - -# pylint: disable=too-many-instance-attributes -class DockerContainer(Container): - """Class that handles creating a docker container.""" - - # pylint: disable=too-many-arguments - def __init__(self, - client: docker.DockerClient, - host: str, - image: str, - count: int = 1, - runtime: str = "runc", - port: int = 0, - **kwargs): - """Trys to setup "count" containers. - - Args: - client: A docker client from dockerpy. - host: The host address the image is running on. - image: The name of the image to run. - count: The number of containers to setup. - runtime: The container runtime to use. - port: The port to reserve. - **kwargs: Additional container options. - """ - assert count >= 1 - assert port == 0 or count == 1 - self._client = client - self._host = host - self._containers = [] - self._count = count - self._image = image - self._runtime = runtime - self._port = port - self._kwargs = kwargs - if port != 0: - self._ports = {"%d/tcp" % port: None} - else: - self._ports = {} - - @contextlib.contextmanager - def detach(self, **env): - env = ["%s=%s" % (key, value) for (key, value) in env.items()] - # Start all containers. - for _ in range(self._count): - try: - # Start the container in a detached mode. - container = self._client.containers.run( - self._image, - detach=True, - remove=True, - runtime=self._runtime, - ports=self._ports, - environment=env, - **self._kwargs) - logging.info("Started detached container %s -> %s", self._image, - container.attrs["Id"]) - self._containers.append(container) - except Exception as exc: - self._clean_containers() - raise exc - try: - # Wait for all containers to be up. - for container in self._containers: - while not container.attrs["State"]["Running"]: - container = self._client.containers.get(container.attrs["Id"]) - yield self - finally: - self._clean_containers() - - def address(self) -> Tuple[str, int]: - assert self._count == 1 - assert self._port != 0 - container = self._client.containers.get(self._containers[0].attrs["Id"]) - port = container.attrs["NetworkSettings"]["Ports"][ - "%d/tcp" % self._port][0]["HostPort"] - return (self._host, port) - - def get_names(self) -> types.GeneratorType: - for container in self._containers: - yield container.name - - def run(self, **env) -> str: - env = ["%s=%s" % (key, value) for (key, value) in env.items()] - return self._client.containers.run( - self._image, - runtime=self._runtime, - ports=self._ports, - remove=True, - environment=env, - **self._kwargs).decode("utf-8") - - def _clean_containers(self): - """Kills all containers.""" - for container in self._containers: - try: - container.kill() - except docker.errors.NotFound: - pass - - -class MockContainer(Container): - """Mock of Container.""" - - def __init__(self, workload: str): - self._workload = workload - - def __enter__(self): - return self - - def run(self, **env): - # Lookup sample data if any exists for the workload module. We use a - # well-defined test locate and a well-defined sample function. - mod = pydoc.locate(workloads.__name__ + "." + self._workload) - if hasattr(mod, "sample"): - return mod.sample(**env) - return "" # No output. - - def address(self) -> Tuple[str, int]: - return ("example.com", 80) - - def get_names(self) -> types.GeneratorType: - yield "mock" - - @contextlib.contextmanager - def detach(self, **env): - yield self diff --git a/benchmarks/harness/machine.py b/benchmarks/harness/machine.py deleted file mode 100644 index 5bdc4aa85..000000000 --- a/benchmarks/harness/machine.py +++ /dev/null @@ -1,265 +0,0 @@ -# python3 -# Copyright 2019 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Machine abstraction passed to benchmarks to run docker containers. - -Abstraction for interacting with test machines. Machines are produced -by Machine producers and represent a local or remote machine. Benchmark -methods in /benchmarks/suite are passed the required number of machines in order -to run the benchmark. Machines contain methods to run commands via bash, -possibly over ssh. Machines also hold a connection to the docker UNIX socket -to run contianers. - - Typical usage example: - - machine = Machine() - machine.run(cmd) - machine.pull(path) - container = machine.container() -""" - -import logging -import os -import re -import subprocess -import time -from typing import List, Tuple - -import docker - -from benchmarks import harness -from benchmarks.harness import container -from benchmarks.harness import machine_mocks -from benchmarks.harness import ssh_connection -from benchmarks.harness import tunnel_dispatcher - -log = logging.getLogger(__name__) - - -class Machine(object): - """The machine object is the primary object for benchmarks. - - Machine objects are passed to each metric function call and benchmarks use - machines to access real connections to those machines. - - Attributes: - _name: Name as a string - """ - _name = "" - - def run(self, cmd: str) -> Tuple[str, str]: - """Convenience method for running a bash command on a machine object. - - Some machines may point to the local machine, and thus, do not have ssh - connections. Run runs a command either local or over ssh and returns the - output stdout and stderr as strings. - - Args: - cmd: The command to run as a string. - - Returns: - The command output. - """ - raise NotImplementedError - - def read(self, path: str) -> str: - """Reads the contents of some file. - - This will be mocked. - - Args: - path: The path to the file to be read. - - Returns: - The file contents. - """ - raise NotImplementedError - - def pull(self, workload: str) -> str: - """Send the given workload to the machine, build and tag it. - - All images must be defined by the workloads directory. - - Args: - workload: The workload name. - - Returns: - The workload tag. - """ - raise NotImplementedError - - def container(self, image: str, **kwargs) -> container.Container: - """Returns a container object. - - Args: - image: The pulled image tag. - **kwargs: Additional container options. - - Returns: - :return: a container.Container object. - """ - raise NotImplementedError - - def sleep(self, amount: float): - """Sleeps the given amount of time.""" - time.sleep(amount) - - def __str__(self): - return self._name - - -class MockMachine(Machine): - """A mocked machine.""" - _name = "mock" - - def run(self, cmd: str) -> Tuple[str, str]: - return "", "" - - def read(self, path: str) -> str: - return machine_mocks.Readfile(path) - - def pull(self, workload: str) -> str: - return workload # Workload is the tag. - - def container(self, image: str, **kwargs) -> container.Container: - return container.MockContainer(image) - - def sleep(self, amount: float): - pass - - -def get_address(machine: Machine) -> str: - """Return a machine's default address.""" - default_route, _ = machine.run("ip route get 8.8.8.8") - return re.search(" src ([0-9.]+) ", default_route).group(1) - - -class LocalMachine(Machine): - """The local machine. - - Attributes: - _name: Name as a string - _docker_client: a pythonic connection to to the local dockerd unix socket. - See: https://github.com/docker/docker-py - """ - - def __init__(self, name): - self._name = name - self._docker_client = docker.from_env() - - def run(self, cmd: str) -> Tuple[str, str]: - process = subprocess.Popen( - cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = process.communicate() - return stdout.decode("utf-8"), stderr.decode("utf-8") - - def read(self, path: str) -> bytes: - # Read the exact path locally. - return open(path, "r").read() - - def pull(self, workload: str) -> str: - # Run the docker build command locally. - logging.info("Building %s@%s locally...", workload, self._name) - with open(harness.LOCAL_WORKLOADS_PATH.format(workload), - "rb") as dockerfile: - self._docker_client.images.build( - fileobj=dockerfile, tag=workload, custom_context=True) - return workload # Workload is the tag. - - def container(self, image: str, **kwargs) -> container.Container: - # Return a local docker container directly. - return container.DockerContainer(self._docker_client, get_address(self), - image, **kwargs) - - def sleep(self, amount: float): - time.sleep(amount) - - -class RemoteMachine(Machine): - """Remote machine accessible via an SSH connection. - - Attributes: - _name: Name as a string - _ssh_connection: a paramiko backed ssh connection which can be used to run - commands on this machine - _tunnel: a python wrapper around a port forwarded ssh connection between a - local unix socket and the remote machine's dockerd unix socket. - _docker_client: a pythonic wrapper backed by the _tunnel. Allows sending - docker commands: see https://github.com/docker/docker-py - """ - - def __init__(self, name, **kwargs): - self._name = name - self._ssh_connection = ssh_connection.SSHConnection(name, **kwargs) - self._tunnel = tunnel_dispatcher.Tunnel(name, **kwargs) - self._tunnel.connect() - self._docker_client = self._tunnel.get_docker_client() - self._has_installers = False - - def run(self, cmd: str) -> Tuple[str, str]: - return self._ssh_connection.run(cmd) - - def read(self, path: str) -> str: - # Just cat remotely. - stdout, stderr = self._ssh_connection.run("cat '{}'".format(path)) - return stdout + stderr - - def install(self, - installer: str, - results: List[bool] = None, - index: int = -1): - """Method unique to RemoteMachine to handle installation of installers. - - Handles installers, which install things that may change between runs (e.g. - runsc). Usually called from gcloud_producer, which expects this method to - to store results. - - Args: - installer: the installer target to run. - results: Passed by the caller of where to store success. - index: Index for this method to store the result in the passed results - list. - """ - # This generates a tarball of the full installer root (which will generate - # be the full bazel root directory) and sends it over. - if not self._has_installers: - archive = self._ssh_connection.send_installers() - self.run("tar -xvf {archive} -C {dir}".format( - archive=archive, dir=harness.REMOTE_INSTALLERS_PATH)) - self._has_installers = True - - # Execute the remote installer. - self.run("sudo {dir}/{file}".format( - dir=harness.REMOTE_INSTALLERS_PATH, file=installer)) - - if results: - results[index] = True - - def pull(self, workload: str) -> str: - # Push to the remote machine and build. - logging.info("Building %s@%s remotely...", workload, self._name) - remote_path = self._ssh_connection.send_workload(workload) - remote_dir = os.path.dirname(remote_path) - # Workloads are all tarballs. - self.run("tar -xvf {remote_path} -C {remote_dir}".format( - remote_path=remote_path, remote_dir=remote_dir)) - self.run("docker build --tag={} {}".format(workload, remote_dir)) - return workload # Workload is the tag. - - def container(self, image: str, **kwargs) -> container.Container: - # Return a remote docker container. - return container.DockerContainer(self._docker_client, get_address(self), - image, **kwargs) - - def sleep(self, amount: float): - time.sleep(amount) diff --git a/benchmarks/harness/machine_mocks/BUILD b/benchmarks/harness/machine_mocks/BUILD deleted file mode 100644 index c8ec4bc79..000000000 --- a/benchmarks/harness/machine_mocks/BUILD +++ /dev/null @@ -1,9 +0,0 @@ -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -py_library( - name = "machine_mocks", - srcs = ["__init__.py"], -) diff --git a/benchmarks/harness/machine_mocks/__init__.py b/benchmarks/harness/machine_mocks/__init__.py deleted file mode 100644 index 00f0085d7..000000000 --- a/benchmarks/harness/machine_mocks/__init__.py +++ /dev/null @@ -1,81 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Machine mock files.""" - -MEMINFO = """\ -MemTotal: 7652344 kB -MemFree: 7174724 kB -MemAvailable: 7152008 kB -Buffers: 7544 kB -Cached: 178856 kB -SwapCached: 0 kB -Active: 270928 kB -Inactive: 68436 kB -Active(anon): 153124 kB -Inactive(anon): 880 kB -Active(file): 117804 kB -Inactive(file): 67556 kB -Unevictable: 0 kB -Mlocked: 0 kB -SwapTotal: 0 kB -SwapFree: 0 kB -Dirty: 900 kB -Writeback: 0 kB -AnonPages: 153000 kB -Mapped: 129120 kB -Shmem: 1044 kB -Slab: 60864 kB -SReclaimable: 22792 kB -SUnreclaim: 38072 kB -KernelStack: 2672 kB -PageTables: 5756 kB -NFS_Unstable: 0 kB -Bounce: 0 kB -WritebackTmp: 0 kB -CommitLimit: 3826172 kB -Committed_AS: 663836 kB -VmallocTotal: 34359738367 kB -VmallocUsed: 0 kB -VmallocChunk: 0 kB -HardwareCorrupted: 0 kB -AnonHugePages: 0 kB -ShmemHugePages: 0 kB -ShmemPmdMapped: 0 kB -CmaTotal: 0 kB -CmaFree: 0 kB -HugePages_Total: 0 -HugePages_Free: 0 -HugePages_Rsvd: 0 -HugePages_Surp: 0 -Hugepagesize: 2048 kB -DirectMap4k: 94196 kB -DirectMap2M: 4624384 kB -DirectMap1G: 3145728 kB -""" - -CONTENTS = { - "/proc/meminfo": MEMINFO, -} - - -def Readfile(path: str) -> str: - """Reads a mock file. - - Args: - path: The target path. - - Returns: - Mocked file contents or None. - """ - return CONTENTS.get(path, None) diff --git a/benchmarks/harness/machine_producers/BUILD b/benchmarks/harness/machine_producers/BUILD deleted file mode 100644 index 81f19bd08..000000000 --- a/benchmarks/harness/machine_producers/BUILD +++ /dev/null @@ -1,84 +0,0 @@ -load("//tools:defs.bzl", "py_library", "py_requirement") - -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -py_library( - name = "harness", - srcs = ["__init__.py"], -) - -py_library( - name = "machine_producer", - srcs = ["machine_producer.py"], -) - -py_library( - name = "mock_producer", - srcs = ["mock_producer.py"], - deps = [ - "//benchmarks/harness:machine", - "//benchmarks/harness/machine_producers:gcloud_producer", - "//benchmarks/harness/machine_producers:machine_producer", - ], -) - -py_library( - name = "yaml_producer", - srcs = ["yaml_producer.py"], - deps = [ - "//benchmarks/harness:machine", - "//benchmarks/harness/machine_producers:machine_producer", - py_requirement( - "PyYAML", - direct = False, - ), - ], -) - -py_library( - name = "gcloud_mock_recorder", - srcs = ["gcloud_mock_recorder.py"], -) - -py_library( - name = "gcloud_producer", - srcs = ["gcloud_producer.py"], - deps = [ - "//benchmarks/harness:machine", - "//benchmarks/harness/machine_producers:gcloud_mock_recorder", - "//benchmarks/harness/machine_producers:machine_producer", - ], -) - -filegroup( - name = "test_data", - srcs = [ - "testdata/get_five.json", - "testdata/get_one.json", - ], -) - -py_library( - name = "gcloud_producer_test_lib", - srcs = ["gcloud_producer_test.py"], - deps = [ - "//benchmarks/harness/machine_producers:machine_producer", - "//benchmarks/harness/machine_producers:mock_producer", - ], -) - -py_test( - name = "gcloud_producer_test", - srcs = [":gcloud_producer_test_lib"], - data = [ - ":test_data", - ], - python_version = "PY3", - tags = [ - "local", - "manual", - ], -) diff --git a/benchmarks/harness/machine_producers/__init__.py b/benchmarks/harness/machine_producers/__init__.py deleted file mode 100644 index 634ef4843..000000000 --- a/benchmarks/harness/machine_producers/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/benchmarks/harness/machine_producers/gcloud_mock_recorder.py b/benchmarks/harness/machine_producers/gcloud_mock_recorder.py deleted file mode 100644 index fd9837a37..000000000 --- a/benchmarks/harness/machine_producers/gcloud_mock_recorder.py +++ /dev/null @@ -1,97 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""A recorder and replay for testing the GCloudProducer. - -MockPrinter and MockReader handle printing and reading mock data for the -purposes of testing. MockPrinter is passed to GCloudProducer objects. The user -can then run scenarios and record them for playback in tests later. - -MockReader is passed to MockGcloudProducer objects and handles reading the -previously recorded mock data. - -It is left to the user to check if data printed is properly redacted for their -own use. The intended usecase for this class is data coming from gcloud -commands, which will contain public IPs and other instance data. - -The data format is json and printed/read from the ./test_data directory. The -data is the output of subprocess.CompletedProcess objects in json format. - - Typical usage example: - - recorder = MockPrinter() - producer = GCloudProducer(args, recorder) - machines = producer.get_machines(1) - with open("my_file.json") as fd: - recorder.write_out(fd) - - reader = MockReader(filename) - producer = MockGcloudProducer(args, mock) - machines = producer.get_machines(1) - assert len(machines) == 1 -""" - -import io -import json -import subprocess - - -class MockPrinter(object): - """Handles printing Mock data for MockGcloudProducer. - - Attributes: - _records: list of json object records for printing - """ - - def __init__(self): - self._records = [] - - def record(self, entry: subprocess.CompletedProcess): - """Records data and strips out ip addresses.""" - - record = { - "args": entry.args, - "stdout": entry.stdout.decode("utf-8"), - "returncode": str(entry.returncode) - } - self._records.append(record) - - def write_out(self, fd: io.FileIO): - """Prints out the data into the given filepath.""" - fd.write(json.dumps(self._records, indent=4)) - - -class MockReader(object): - """Handles reading Mock data for MockGcloudProducer. - - Attributes: - _records: List[json] records read from the passed in file. - """ - - def __init__(self, filepath: str): - with open(filepath, "rb") as file: - self._records = json.loads(file.read()) - self._i = 0 - - def __iter__(self): - return self - - def __next__(self, args) -> subprocess.CompletedProcess: - """Returns the next record as a CompletedProcess.""" - if self._i < len(self._records): - record = self._records[self._i] - stdout = record["stdout"].encode("ascii") - returncode = int(record["returncode"]) - return subprocess.CompletedProcess( - args=args, returncode=returncode, stdout=stdout, stderr=b"") - raise StopIteration() diff --git a/benchmarks/harness/machine_producers/gcloud_producer.py b/benchmarks/harness/machine_producers/gcloud_producer.py deleted file mode 100644 index 44d72f575..000000000 --- a/benchmarks/harness/machine_producers/gcloud_producer.py +++ /dev/null @@ -1,250 +0,0 @@ -# python3 -# Copyright 2019 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""A machine producer which produces machine objects using `gcloud`. - -Machine producers produce valid harness.Machine objects which are backed by -real machines. This producer produces those machines on the given user's GCP -account using the `gcloud` tool. - -GCloudProducer creates instances on the given GCP account named like: -`machine-XXXXXXX-XXXX-XXXX-XXXXXXXXXXXX` in a randomized fashion such that name -collisions with user instances shouldn't happen. - - Typical usage example: - - producer = GCloudProducer(args) - machines = producer.get_machines(NUM_MACHINES) - # run stuff on machines with machines[i].run(CMD) - producer.release_machines(NUM_MACHINES) -""" -import datetime -import json -import subprocess -import threading -from typing import List, Dict, Any -import uuid - -from benchmarks.harness import machine -from benchmarks.harness.machine_producers import gcloud_mock_recorder -from benchmarks.harness.machine_producers import machine_producer - - -class GCloudProducer(machine_producer.MachineProducer): - """Implementation of MachineProducer backed by GCP. - - Produces Machine objects backed by GCP instances. - - Attributes: - image: image name as a string. - zone: string to a valid GCP zone. - machine_type: type of GCP to create (e.g. n1-standard-4). - installers: list of installers post-boot. - ssh_key_file: path to a valid ssh private key. See README on vaild ssh keys. - ssh_user: string of user name for ssh_key - ssh_password: string of password for ssh key - internal: if true, use internal IPs of instances. Used if bm-tools is - running on a GCP vm when a firewall is set for external IPs. - mock: a mock printer which will print mock data if required. Mock data is - recorded output from subprocess calls (returncode, stdout, args). - condition: mutex for this class around machine creation and deleteion. - """ - - def __init__(self, - image: str, - zone: str, - machine_type: str, - installers: List[str], - ssh_key_file: str, - ssh_user: str, - ssh_password: str, - internal: bool, - mock: gcloud_mock_recorder.MockPrinter = None): - self.image = image - self.zone = zone - self.machine_type = machine_type - self.installers = installers - self.ssh_key_file = ssh_key_file - self.ssh_user = ssh_user - self.ssh_password = ssh_password - self.internal = internal - self.mock = mock - self.condition = threading.Condition() - - def get_machines(self, num_machines: int) -> List[machine.Machine]: - """Returns requested number of machines backed by GCP instances.""" - if num_machines <= 0: - raise ValueError( - "Cannot ask for {num} machines!".format(num=num_machines)) - with self.condition: - names = self._get_unique_names(num_machines) - instances = self._build_instances(names) - self._add_ssh_key_to_instances(names) - machines = self._machines_from_instances(instances) - - # Install all bits in lock-step. - # - # This will perform paralell installations for however many machines we - # have, but it's easy to track errors because if installing (a, b, c), we - # won't install "c" until "b" is installed on all machines. - for installer in self.installers: - threads = [None] * len(machines) - results = [False] * len(machines) - for i in range(len(machines)): - threads[i] = threading.Thread( - target=machines[i].install, args=(installer, results, i)) - threads[i].start() - for thread in threads: - thread.join() - for result in results: - if not result: - raise NotImplementedError( - "Installers failed on at least one machine!") - - # Add this user to each machine's docker group. - for m in machines: - m.run("sudo setfacl -m user:$USER:rw /var/run/docker.sock") - - return machines - - def release_machines(self, machine_list: List[machine.Machine]): - """Releases the requested number of machines, deleting the instances.""" - if not machine_list: - return - cmd = "gcloud compute instances delete --quiet".split(" ") - names = [str(m) for m in machine_list] - cmd.extend(names) - cmd.append("--zone={zone}".format(zone=self.zone)) - self._run_command(cmd, detach=True) - - def _machines_from_instances( - self, instances: List[Dict[str, Any]]) -> List[machine.Machine]: - """Creates Machine Objects from json data describing created instances.""" - machines = [] - for instance in instances: - name = instance["name"] - external = instance["networkInterfaces"][0]["accessConfigs"][0]["natIP"] - internal = instance["networkInterfaces"][0]["networkIP"] - kwargs = { - "hostname": internal if self.internal else external, - "key_path": self.ssh_key_file, - "username": self.ssh_user, - "key_password": self.ssh_password - } - machines.append(machine.RemoteMachine(name=name, **kwargs)) - return machines - - def _get_unique_names(self, num_names) -> List[str]: - """Returns num_names unique names based on data from the GCP project.""" - return ["machine-" + str(uuid.uuid4()) for _ in range(0, num_names)] - - def _build_instances(self, names: List[str]) -> List[Dict[str, Any]]: - """Creates instances using gcloud command. - - Runs the command `gcloud compute instances create` and returns json data - on created instances on success. Creates len(names) instances, one for each - name. - - Args: - names: list of names of instances to create. - - Returns: - List of json data describing created machines. - """ - if not names: - raise ValueError( - "_build_instances cannot create instances without names.") - cmd = "gcloud compute instances create".split(" ") - cmd.extend(names) - cmd.append("--image=" + self.image) - cmd.append("--zone=" + self.zone) - cmd.append("--machine-type=" + self.machine_type) - res = self._run_command(cmd) - data = res.stdout - data = str(data, "utf-8") if isinstance(data, (bytes, bytearray)) else data - return json.loads(data) - - def _add_ssh_key_to_instances(self, names: List[str]) -> None: - """Adds ssh key to instances by calling gcloud ssh command. - - Runs the command `gcloud compute ssh instance_name` on list of images by - name. Tries to ssh into given instance. - - Args: - names: list of machine names to which to add the ssh-key - self.ssh_key_file. - - Raises: - subprocess.CalledProcessError: when underlying subprocess call returns an - error other than 255 (Connection closed by remote host). - TimeoutError: when 3 unsuccessful tries to ssh into the host return 255. - """ - for name in names: - cmd = "gcloud compute ssh {user}@{name}".format( - user=self.ssh_user, name=name).split(" ") - if self.internal: - cmd.append("--internal-ip") - cmd.append("--ssh-key-file={key}".format(key=self.ssh_key_file)) - cmd.append("--zone={zone}".format(zone=self.zone)) - cmd.append("--command=uname") - timeout = datetime.timedelta(seconds=5 * 60) - start = datetime.datetime.now() - while datetime.datetime.now() <= timeout + start: - try: - self._run_command(cmd) - break - except subprocess.CalledProcessError: - if datetime.datetime.now() > timeout + start: - raise TimeoutError( - "Could not SSH into instance after 5 min: {name}".format( - name=name)) - - def _run_command(self, - cmd: List[str], - detach: bool = False) -> [None, subprocess.CompletedProcess]: - """Runs command as a subprocess. - - Runs command as subprocess and returns the result. - If this has a mock recorder, use the record method to record the subprocess - call. - - Args: - cmd: command to be run as a list of strings. - detach: if True, run the child process and don't wait for it to return. - - Returns: - Completed process object to be parsed by caller or None if detach=True. - - Raises: - CalledProcessError: if subprocess.run returns an error. - """ - cmd = cmd + ["--format=json"] - if detach: - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - if self.mock: - out, _ = p.communicate() - self.mock.record( - subprocess.CompletedProcess( - returncode=p.returncode, stdout=out, args=p.args)) - return - - res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - if self.mock: - self.mock.record(res) - if res.returncode != 0: - raise subprocess.CalledProcessError( - cmd=" ".join(res.args), - output=res.stdout, - stderr=res.stderr, - returncode=res.returncode) - return res diff --git a/benchmarks/harness/machine_producers/gcloud_producer_test.py b/benchmarks/harness/machine_producers/gcloud_producer_test.py deleted file mode 100644 index c8adb2bdc..000000000 --- a/benchmarks/harness/machine_producers/gcloud_producer_test.py +++ /dev/null @@ -1,48 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tests GCloudProducer using mock data. - -GCloudProducer produces machines using 'get_machines' and 'release_machines' -methods. The tests check recorded data (jsonified subprocess.CompletedProcess -objects) of the producer producing one and five machines. -""" -import os -import types - -from benchmarks.harness.machine_producers import machine_producer -from benchmarks.harness.machine_producers import mock_producer - -TEST_DIR = os.path.dirname(__file__) - - -def run_get_release(producer: machine_producer.MachineProducer, - num_machines: int, - validator: types.FunctionType = None): - machines = producer.get_machines(num_machines) - assert len(machines) == num_machines - if validator: - validator(machines=machines, cmd="uname -a", workload=None) - producer.release_machines(machines) - - -def test_run_one(): - mock = mock_producer.MockReader(TEST_DIR + "get_one.json") - producer = mock_producer.MockGCloudProducer(mock) - run_get_release(producer, 1) - - -def test_run_five(): - mock = mock_producer.MockReader(TEST_DIR + "get_five.json") - producer = mock_producer.MockGCloudProducer(mock) - run_get_release(producer, 5) diff --git a/benchmarks/harness/machine_producers/machine_producer.py b/benchmarks/harness/machine_producers/machine_producer.py deleted file mode 100644 index f5591c026..000000000 --- a/benchmarks/harness/machine_producers/machine_producer.py +++ /dev/null @@ -1,51 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Abstract types.""" - -import threading -from typing import List - -from benchmarks.harness import machine - - -class MachineProducer: - """Abstract Machine producer.""" - - def get_machines(self, num_machines: int) -> List[machine.Machine]: - """Returns the requested number of machines.""" - raise NotImplementedError - - def release_machines(self, machine_list: List[machine.Machine]): - """Releases the given set of machines.""" - raise NotImplementedError - - -class LocalMachineProducer(MachineProducer): - """Produces Local Machines.""" - - def __init__(self, limit: int): - self.limit_sem = threading.Semaphore(value=limit) - - def get_machines(self, num_machines: int) -> List[machine.Machine]: - """Returns the request number of MockMachines.""" - - self.limit_sem.acquire() - return [machine.LocalMachine("local") for _ in range(num_machines)] - - def release_machines(self, machine_list: List[machine.MockMachine]): - """No-op.""" - if not machine_list: - raise ValueError("Cannot release an empty list!") - self.limit_sem.release() - machine_list.clear() diff --git a/benchmarks/harness/machine_producers/mock_producer.py b/benchmarks/harness/machine_producers/mock_producer.py deleted file mode 100644 index 37e9cb4b7..000000000 --- a/benchmarks/harness/machine_producers/mock_producer.py +++ /dev/null @@ -1,52 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Producers of mocks.""" - -from typing import List, Any - -from benchmarks.harness import machine -from benchmarks.harness.machine_producers import gcloud_mock_recorder -from benchmarks.harness.machine_producers import gcloud_producer -from benchmarks.harness.machine_producers import machine_producer - - -class MockMachineProducer(machine_producer.MachineProducer): - """Produces MockMachine objects.""" - - def get_machines(self, num_machines: int) -> List[machine.MockMachine]: - """Returns the request number of MockMachines.""" - return [machine.MockMachine() for i in range(num_machines)] - - def release_machines(self, machine_list: List[machine.MockMachine]): - """No-op.""" - return - - -class MockGCloudProducer(gcloud_producer.GCloudProducer): - """Mocks GCloudProducer for testing purposes.""" - - def __init__(self, mock: gcloud_mock_recorder.MockReader, **kwargs): - gcloud_producer.GCloudProducer.__init__( - self, project="mock", ssh_private_key_path="mock", **kwargs) - self.mock = mock - - def _validate_ssh_file(self): - pass - - def _run_command(self, cmd): - return self.mock.pop(cmd) - - def _machines_from_instances( - self, instances: List[Any]) -> List[machine.MockMachine]: - return [machine.MockMachine() for _ in instances] diff --git a/benchmarks/harness/machine_producers/testdata/get_five.json b/benchmarks/harness/machine_producers/testdata/get_five.json deleted file mode 100644 index 32bad1b06..000000000 --- a/benchmarks/harness/machine_producers/testdata/get_five.json +++ /dev/null @@ -1,211 +0,0 @@ -[ - { - "args": [ - "gcloud", - "compute", - "instances", - "list", - "--project", - "project", - "--format=json" - ], - "stdout": "[{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":{\"natIP\":\"0.0.0.0\"}]}]}]", - "returncode": "0" - }, - { - "args": [ - "gcloud", - "compute", - "instances", - "create", - "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc", - "machine-5f28f145-cc2d-427d-9cbf-428d164cdb92", - "machine-da5859b5-bae6-435d-8005-0202d6f6e065", - "machine-880a8a2f-918c-4f9e-a43c-ed3c8e02ea05", - "machine-1149147d-71e2-43ea-8fe1-49256e5c441c", - "--preemptible", - "--image=ubuntu-1910-eoan-v20191204", - "--zone=us-west1-b", - "--image-project=ubuntu-os-cloud", - "--format=json" - ], - "stdout": "[{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]}]", - "returncode": "0" - }, - { - "args": [ - "gcloud", - "compute", - "instances", - "start", - "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc", - "machine-5f28f145-cc2d-427d-9cbf-428d164cdb92", - "machine-da5859b5-bae6-435d-8005-0202d6f6e065", - "machine-880a8a2f-918c-4f9e-a43c-ed3c8e02ea05", - "machine-1149147d-71e2-43ea-8fe1-49256e5c441c", - "--zone=us-west1-b", - "--project=project", - "--format=json" - ], - "stdout": "[{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]}]", - "returncode": "0" - }, - { - "args": [ - "gcloud", - "compute", - "ssh", - "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc", - "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools", - "--zone=us-west1-b", - "--command=uname", - "--format=json" - ], - "stdout": "", - "returncode": "255" - }, - { - "args": [ - "gcloud", - "compute", - "ssh", - "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc", - "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools", - "--zone=us-west1-b", - "--command=uname", - "--format=json" - ], - "stdout": "", - "returncode": "255" - }, - { - "args": [ - "gcloud", - "compute", - "ssh", - "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc", - "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools", - "--zone=us-west1-b", - "--command=uname", - "--format=json" - ], - "stdout": "", - "returncode": "255" - }, - { - "args": [ - "gcloud", - "compute", - "ssh", - "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc", - "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools", - "--zone=us-west1-b", - "--command=uname", - "--format=json" - ], - "stdout": "", - "returncode": "255" - }, - { - "args": [ - "gcloud", - "compute", - "ssh", - "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc", - "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools", - "--zone=us-west1-b", - "--command=uname", - "--format=json" - ], - "stdout": "", - "returncode": "255" - }, - { - "args": [ - "gcloud", - "compute", - "ssh", - "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc", - "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools", - "--zone=us-west1-b", - "--command=uname", - "--format=json" - ], - "stdout": "Linux\n[]\n", - "returncode": "0" - }, - { - "args": [ - "gcloud", - "compute", - "ssh", - "machine-5f28f145-cc2d-427d-9cbf-428d164cdb92", - "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools", - "--zone=us-west1-b", - "--command=uname", - "--format=json" - ], - "stdout": "Linux\n[]\n", - "returncode": "0" - }, - { - "args": [ - "gcloud", - "compute", - "ssh", - "machine-da5859b5-bae6-435d-8005-0202d6f6e065", - "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools", - "--zone=us-west1-b", - "--command=uname", - "--format=json" - ], - "stdout": "Linux\n[]\n", - "returncode": "0" - }, - { - "args": [ - "gcloud", - "compute", - "ssh", - "machine-880a8a2f-918c-4f9e-a43c-ed3c8e02ea05", - "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools", - "--zone=us-west1-b", - "--command=uname", - "--format=json" - ], - "stdout": "Linux\n[]\n", - "returncode": "0" - }, - { - "args": [ - "gcloud", - "compute", - "ssh", - "machine-1149147d-71e2-43ea-8fe1-49256e5c441c", - "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools", - "--zone=us-west1-b", - "--command=uname", - "--format=json" - ], - "stdout": "Linux\n[]\n", - "returncode": "0" - }, - { - "args": [ - "gcloud", - "compute", - "instances", - "delete", - "--quiet", - "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc", - "machine-5f28f145-cc2d-427d-9cbf-428d164cdb92", - "machine-da5859b5-bae6-435d-8005-0202d6f6e065", - "machine-880a8a2f-918c-4f9e-a43c-ed3c8e02ea05", - "machine-1149147d-71e2-43ea-8fe1-49256e5c441c", - "--zone=us-west1-b", - "--format=json" - ], - "stdout": "[]\n", - "returncode": "0" - } -] diff --git a/benchmarks/harness/machine_producers/testdata/get_one.json b/benchmarks/harness/machine_producers/testdata/get_one.json deleted file mode 100644 index c359c19c8..000000000 --- a/benchmarks/harness/machine_producers/testdata/get_one.json +++ /dev/null @@ -1,145 +0,0 @@ -[ - { - "args": [ - "gcloud", - "compute", - "instances", - "list", - "--project", - "linux-testing-user", - "--format=json" - ], - "stdout": "[{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]}]", - - "returncode": "0" - }, - { - "args": [ - "gcloud", - "compute", - "instances", - "create", - "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc", - "--preemptible", - "--image=ubuntu-1910-eoan-v20191204", - "--zone=us-west1-b", - "--image-project=ubuntu-os-cloud", - "--format=json" - ], - "stdout": "[{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]}]", - "returncode": "0" - }, - { - "args": [ - "gcloud", - "compute", - "instances", - "start", - "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc", - "--zone=us-west1-b", - "--project=linux-testing-user", - "--format=json" - ], - "stdout": "[{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]}]", - - "returncode": "0" - }, - { - "args": [ - "gcloud", - "compute", - "ssh", - "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc", - "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools", - "--zone=us-west1-b", - "--command=uname", - "--format=json" - ], - "stdout": "", - "returncode": "255" - }, - { - "args": [ - "gcloud", - "compute", - "ssh", - "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc", - "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools", - "--zone=us-west1-b", - "--command=uname", - "--format=json" - ], - "stdout": "", - "returncode": "255" - }, - { - "args": [ - "gcloud", - "compute", - "ssh", - "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc", - "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools", - "--zone=us-west1-b", - "--command=uname", - "--format=json" - ], - "stdout": "", - "returncode": "255" - }, - { - "args": [ - "gcloud", - "compute", - "ssh", - "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc", - "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools", - "--zone=us-west1-b", - "--command=uname", - "--format=json" - ], - "stdout": "", - "returncode": "255" - }, - { - "args": [ - "gcloud", - "compute", - "ssh", - "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc", - "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools", - "--zone=us-west1-b", - "--command=uname", - "--format=json" - ], - "stdout": "", - "returncode": "255" - }, - { - "args": [ - "gcloud", - "compute", - "ssh", - "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc", - "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools", - "--zone=us-west1-b", - "--command=uname", - "--format=json" - ], - "stdout": "Linux\n[]\n", - "returncode": "0" - }, - { - "args": [ - "gcloud", - "compute", - "instances", - "delete", - "--quiet", - "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc", - "--zone=us-west1-b", - "--format=json" - ], - "stdout": "[]\n", - "returncode": "0" - } -] diff --git a/benchmarks/harness/machine_producers/yaml_producer.py b/benchmarks/harness/machine_producers/yaml_producer.py deleted file mode 100644 index 5d334e480..000000000 --- a/benchmarks/harness/machine_producers/yaml_producer.py +++ /dev/null @@ -1,106 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Producers based on yaml files.""" - -import os -import threading -from typing import Dict -from typing import List - -import yaml - -from benchmarks.harness import machine -from benchmarks.harness.machine_producers import machine_producer - - -class YamlMachineProducer(machine_producer.MachineProducer): - """Loads machines from a yaml file.""" - - def __init__(self, path: str): - self.machines = build_machines(path) - self.max_machines = len(self.machines) - self.machine_condition = threading.Condition() - - def get_machines(self, num_machines: int) -> List[machine.Machine]: - if num_machines > self.max_machines: - raise ValueError( - "Insufficient Ammount of Machines. {ask} asked for and have {max_num} max." - .format(ask=num_machines, max_num=self.max_machines)) - - with self.machine_condition: - while not self._enough_machines(num_machines): - self.machine_condition.wait(timeout=1) - return [self.machines.pop(0) for _ in range(num_machines)] - - def release_machines(self, machine_list: List[machine.Machine]): - with self.machine_condition: - while machine_list: - next_machine = machine_list.pop() - self.machines.append(next_machine) - self.machine_condition.notify() - - def _enough_machines(self, ask: int): - return ask <= len(self.machines) - - -def build_machines(path: str, num_machines: str = -1) -> List[machine.Machine]: - """Builds machine objects defined by the yaml file "path". - - Args: - path: The path to a yaml file which defines machines. - num_machines: Optional limit on how many machine objects to build. - - Returns: - Machine objects in a list. - - If num_machines is set, len(machines) <= num_machines. - """ - data = parse_yaml(path) - machines = [] - for key, value in data.items(): - if len(machines) == num_machines: - return machines - if isinstance(value, dict): - machines.append(machine.RemoteMachine(key, **value)) - else: - machines.append(machine.LocalMachine(key)) - return machines - - -def parse_yaml(path: str) -> Dict[str, Dict[str, str]]: - """Parse the yaml file pointed by path. - - Args: - path: The path to yaml file. - - Returns: - The contents of the yaml file as a dictionary. - """ - data = get_file_contents(path) - return yaml.load(data, Loader=yaml.Loader) - - -def get_file_contents(path: str) -> str: - """Dumps the file contents to a string and returns them. - - Args: - path: The path to dump. - - Returns: - The file contents as a string. - """ - if not os.path.isabs(path): - path = os.path.abspath(path) - with open(path) as input_file: - return input_file.read() diff --git a/benchmarks/harness/ssh_connection.py b/benchmarks/harness/ssh_connection.py deleted file mode 100644 index b8c8e42d4..000000000 --- a/benchmarks/harness/ssh_connection.py +++ /dev/null @@ -1,126 +0,0 @@ -# python3 -# Copyright 2019 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""SSHConnection handles the details of SSH connections.""" - -import logging -import os -import warnings - -import paramiko - -from benchmarks import harness - -# Get rid of paramiko Cryptography Warnings. -warnings.filterwarnings(action="ignore", module=".*paramiko.*") - -log = logging.getLogger(__name__) - - -def send_one_file(client: paramiko.SSHClient, path: str, - remote_dir: str) -> str: - """Sends a single file via an SSH client. - - Args: - client: The existing SSH client. - path: The local path. - remote_dir: The remote directory. - - Returns: - :return: The remote path as a string. - """ - filename = path.split("/").pop() - if remote_dir != ".": - client.exec_command("mkdir -p " + remote_dir) - with client.open_sftp() as ftp_client: - ftp_client.put(path, os.path.join(remote_dir, filename)) - return os.path.join(remote_dir, filename) - - -class SSHConnection: - """SSH connection to a remote machine.""" - - def __init__(self, name: str, hostname: str, key_path: str, username: str, - **kwargs): - """Sets up a paramiko ssh connection to the given hostname.""" - self._name = name # Unused. - self._hostname = hostname - self._username = username - self._key_path = key_path # RSA Key path - self._kwargs = kwargs - # SSHConnection wraps paramiko. paramiko supports RSA, ECDSA, and Ed25519 - # keys, and we've chosen to only suport and require RSA keys. paramiko - # supports RSA keys that begin with '----BEGIN RSAKEY----'. - # https://stackoverflow.com/questions/53600581/ssh-key-generated-by-ssh-keygen-is-not-recognized-by-paramiko - self.rsa_key = self._rsa() - self.run("true") # Validate. - - def _client(self) -> paramiko.SSHClient: - """Returns a connected SSH client.""" - client = paramiko.SSHClient() - client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - client.connect( - hostname=self._hostname, - port=22, - username=self._username, - pkey=self.rsa_key, - allow_agent=False, - look_for_keys=False) - return client - - def _rsa(self): - if "key_password" in self._kwargs: - password = self._kwargs["key_password"] - else: - password = None - rsa = paramiko.RSAKey.from_private_key_file(self._key_path, password) - return rsa - - def run(self, cmd: str) -> (str, str): - """Runs a command via ssh. - - Args: - cmd: The shell command to run. - - Returns: - The contents of stdout and stderr. - """ - with self._client() as client: - log.info("running command: %s", cmd) - _, stdout, stderr = client.exec_command(command=cmd) - log.info("returned status: %d", stdout.channel.recv_exit_status()) - stdout = stdout.read().decode("utf-8") - stderr = stderr.read().decode("utf-8") - log.info("stdout: %s", stdout) - log.info("stderr: %s", stderr) - return stdout, stderr - - def send_workload(self, name: str) -> str: - """Sends a workload tarball to the remote machine. - - Args: - name: The workload name. - - Returns: - The remote path. - """ - with self._client() as client: - return send_one_file(client, harness.LOCAL_WORKLOADS_PATH.format(name), - harness.REMOTE_WORKLOADS_PATH.format(name)) - - def send_installers(self) -> str: - with self._client() as client: - return send_one_file( - client, - path=harness.INSTALLER_ARCHIVE, - remote_dir=harness.REMOTE_INSTALLERS_PATH) diff --git a/benchmarks/harness/tunnel_dispatcher.py b/benchmarks/harness/tunnel_dispatcher.py deleted file mode 100644 index c56fd022a..000000000 --- a/benchmarks/harness/tunnel_dispatcher.py +++ /dev/null @@ -1,122 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tunnel handles setting up connections to remote machines. - -Tunnel dispatcher is a wrapper around the connection from a local UNIX socket -and a remote UNIX socket via SSH with port forwarding. This is done to -initialize the pythonic dockerpy client to run containers on the remote host by -connecting to /var/run/docker.sock (where Docker is listening). Tunnel -dispatcher sets up the local UNIX socket and calls the `ssh` command as a -subprocess, and holds a reference to that subprocess. It manages clean-up on -exit as best it can by killing the ssh subprocess and deleting the local UNIX -socket,stored in /tmp for easy cleanup in most systems if this fails. - - Typical usage example: - - t = Tunnel(name, **kwargs) - t.connect() - client = t.get_docker_client() # - client.containers.run("ubuntu", "echo hello world") - -""" - -import os -import tempfile -import time - -import docker -import pexpect - -SSH_TUNNEL_COMMAND = """ssh - -o GlobalKnownHostsFile=/dev/null - -o UserKnownHostsFile=/dev/null - -o StrictHostKeyChecking=no - -o IdentitiesOnly=yes - -nNT -L {filename}:/var/run/docker.sock - -i {key_path} - {username}@{hostname}""" - - -class Tunnel(object): - """The tunnel object represents the tunnel via ssh. - - This connects a local unix domain socket with a remote socket. - - Attributes: - _filename: a temporary name of the UNIX socket prefixed by the name - argument. - _hostname: the IP or resolvable hostname of the remote host. - _username: the username of the ssh_key used to run ssh. - _key_path: path to a valid key. - _key_password: optional password to the ssh key in _key_path - _process: holds reference to the ssh subprocess created. - - Returns: - The new minimum port. - - Raises: - ConnectionError: If no available port is found. - """ - - def __init__(self, - name: str, - hostname: str, - username: str, - key_path: str, - key_password: str = "", - **kwargs): - self._filename = tempfile.NamedTemporaryFile(prefix=name).name - self._hostname = hostname - self._username = username - self._key_path = key_path - self._key_password = key_password - self._kwargs = kwargs - self._process = None - - def connect(self): - """Connects the SSH tunnel and stores the subprocess reference in _process.""" - cmd = SSH_TUNNEL_COMMAND.format( - filename=self._filename, - key_path=self._key_path, - username=self._username, - hostname=self._hostname) - self._process = pexpect.spawn(cmd, timeout=10) - - # If given a password, assume we'll be asked for it. - if self._key_password: - self._process.expect(["Enter passphrase for key .*: "]) - self._process.sendline(self._key_password) - - while True: - # Wait for the tunnel to appear. - if self._process.exitstatus is not None: - raise ConnectionError("Error in setting up ssh tunnel") - if os.path.exists(self._filename): - return - time.sleep(0.1) - - def path(self): - """Return the socket file.""" - return self._filename - - def get_docker_client(self): - """Returns a docker client for this Tunnel.""" - return docker.DockerClient(base_url="unix:/" + self._filename) - - def __del__(self): - """Closes the ssh connection process and deletes the socket file.""" - if self._process: - self._process.close() - if os.path.exists(self._filename): - os.remove(self._filename) diff --git a/benchmarks/requirements.txt b/benchmarks/requirements.txt deleted file mode 100644 index 577eb1a2e..000000000 --- a/benchmarks/requirements.txt +++ /dev/null @@ -1,32 +0,0 @@ -asn1crypto==1.2.0 -atomicwrites==1.3.0 -attrs==19.3.0 -bcrypt==3.1.7 -certifi==2019.9.11 -cffi==1.13.2 -chardet==3.0.4 -Click==7.0 -cryptography==2.8 -docker==3.7.0 -docker-pycreds==0.4.0 -idna==2.8 -importlib-metadata==0.23 -more-itertools==7.2.0 -packaging==19.2 -paramiko==2.6.0 -pathlib2==2.3.5 -pexpect==4.7.0 -pluggy==0.9.0 -ptyprocess==0.6.0 -py==1.8.0 -pycparser==2.19 -PyNaCl==1.3.0 -pyparsing==2.4.5 -pytest==4.3.0 -PyYAML==5.1.2 -requests==2.22.0 -six==1.13.0 -urllib3==1.25.7 -wcwidth==0.1.7 -websocket-client==0.56.0 -zipp==0.6.0 diff --git a/benchmarks/runner/BUILD b/benchmarks/runner/BUILD deleted file mode 100644 index 471debfdf..000000000 --- a/benchmarks/runner/BUILD +++ /dev/null @@ -1,56 +0,0 @@ -load("//tools:defs.bzl", "py_library", "py_requirement", "py_test") -load("//benchmarks:defs.bzl", "test_deps") - -package(licenses = ["notice"]) - -py_library( - name = "runner", - srcs = ["__init__.py"], - data = [ - "//benchmarks/workloads:files", - ], - visibility = ["//benchmarks:__pkg__"], - deps = [ - ":commands", - "//benchmarks/harness:benchmark_driver", - "//benchmarks/harness/machine_producers:machine_producer", - "//benchmarks/harness/machine_producers:mock_producer", - "//benchmarks/harness/machine_producers:yaml_producer", - "//benchmarks/suites", - "//benchmarks/suites:absl", - "//benchmarks/suites:density", - "//benchmarks/suites:fio", - "//benchmarks/suites:helpers", - "//benchmarks/suites:http", - "//benchmarks/suites:media", - "//benchmarks/suites:ml", - "//benchmarks/suites:network", - "//benchmarks/suites:redis", - "//benchmarks/suites:startup", - "//benchmarks/suites:sysbench", - "//benchmarks/suites:syscall", - py_requirement("click"), - ], -) - -py_library( - name = "commands", - srcs = ["commands.py"], - deps = [ - py_requirement("click"), - ], -) - -py_test( - name = "runner_test", - srcs = ["runner_test.py"], - python_version = "PY3", - tags = [ - "local", - "manual", - ], - deps = test_deps + [ - ":runner", - py_requirement("click"), - ], -) diff --git a/benchmarks/runner/__init__.py b/benchmarks/runner/__init__.py deleted file mode 100644 index fc59cf505..000000000 --- a/benchmarks/runner/__init__.py +++ /dev/null @@ -1,308 +0,0 @@ -# python3 -# Copyright 2019 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""High-level benchmark utility.""" - -import copy -import csv -import logging -import pkgutil -import pydoc -import re -import subprocess -import sys -import types -from typing import List -from typing import Tuple - -import click - -from benchmarks import harness -from benchmarks import suites -from benchmarks.harness import benchmark_driver -from benchmarks.harness.machine_producers import gcloud_producer -from benchmarks.harness.machine_producers import machine_producer -from benchmarks.harness.machine_producers import mock_producer -from benchmarks.harness.machine_producers import yaml_producer -from benchmarks.runner import commands - - -@click.group() -@click.option( - "--verbose/--no-verbose", default=False, help="Enable verbose logging.") -@click.option("--debug/--no-debug", default=False, help="Enable debug logging.") -def runner(verbose: bool = False, debug: bool = False): - """Run distributed benchmarks. - - See the run and list commands for details. - - Args: - verbose: Enable verbose logging. - debug: Enable debug logging (supercedes verbose). - """ - if debug: - logging.basicConfig(level=logging.DEBUG) - elif verbose: - logging.basicConfig(level=logging.INFO) - - -def find_benchmarks( - regex: str) -> List[Tuple[str, types.ModuleType, types.FunctionType]]: - """Finds all available benchmarks. - - Args: - regex: A regular expression to match. - - Returns: - A (short_name, module, function) tuple for each match. - """ - pkgs = pkgutil.walk_packages(suites.__path__, suites.__name__ + ".") - found = [] - for _, name, _ in pkgs: - mod = pydoc.locate(name) - funcs = [ - getattr(mod, x) - for x in dir(mod) - if suites.is_benchmark(getattr(mod, x)) - ] - for func in funcs: - # Use the short_name with the benchmarks. prefix stripped. - prefix_len = len(suites.__name__ + ".") - short_name = mod.__name__[prefix_len:] + "." + func.__name__ - # Add to the list if a pattern is provided. - if re.compile(regex).match(short_name): - found.append((short_name, mod, func)) - return found - - -@runner.command("list") -@click.argument("method", nargs=-1) -def list_all(method): - """Lists available benchmarks.""" - if not method: - method = ".*" - else: - method = "(" + ",".join(method) + ")" - for (short_name, _, func) in find_benchmarks(method): - print("Benchmark %s:" % short_name) - metrics = suites.benchmark_metrics(func) - if func.__doc__: - print(" " + func.__doc__.lstrip().rstrip()) - if metrics: - print("\n Metrics:") - for metric in metrics: - print("\t{name}: {doc}".format(name=metric[0], doc=metric[1])) - print("\n") - - -@runner.command("run-local", commands.LocalCommand) -@click.pass_context -def run_local(ctx, limit: float, **kwargs): - """Runs benchmarks locally.""" - run(ctx, machine_producer.LocalMachineProducer(limit=limit), **kwargs) - - -@runner.command("run-mock", commands.RunCommand) -@click.pass_context -def run_mock(ctx, **kwargs): - """Runs benchmarks on Mock machines. Used for testing.""" - run(ctx, mock_producer.MockMachineProducer(), **kwargs) - - -@runner.command("run-gcp", commands.GCPCommand) -@click.pass_context -def run_gcp(ctx, image_file: str, zone_file: str, internal: bool, - machine_type: str, installers: List[str], **kwargs): - """Runs all benchmarks on GCP instances.""" - - # Resolve all files. - image = subprocess.check_output([image_file]).rstrip() - zone = subprocess.check_output([zone_file]).rstrip() - key_file = harness.make_key() - - producer = gcloud_producer.GCloudProducer( - image, - zone, - machine_type, - installers, - ssh_key_file=key_file, - ssh_user=harness.DEFAULT_USER, - ssh_password="", - internal=internal) - - try: - run(ctx, producer, **kwargs) - finally: - harness.delete_key() - - -def run(ctx, producer: machine_producer.MachineProducer, method: str, runs: int, - runtime: List[str], metric: List[str], stat: str, **kwargs): - """Runs arbitrary benchmarks. - - All unknown command line flags are passed through to the underlying benchmark - method. Flags may be specified multiple times, in which case it is considered - a "dimension" for the test, and a comma-separated table will be emitted - instead of a single result. - - See the output of list to see available metrics for any given benchmark - method. The method parameter is a regular expression that will match against - available benchmarks. If multiple benchmarks match, then that is considered a - distinct "dimension" for the test. - - All benchmarks are run in parallel where possible, but have exclusive - ownership over the individual machines. - - Every benchmark method will be run the times indicated by --runs. - - Args: - ctx: Click context. - producer: A Machine Producer from which to get Machines. - method: A regular expression for methods to be run. - runs: Number of runs. - runtime: A list of runtimes to test. - metric: A list of metrics to extract. - stat: The class of statistics to extract. - **kwargs: Dimensions to test. - """ - # First, calculate additional arguments. - # - # This essentially calculates any arguments that appear multiple times, and - # moves those to the "dimensions" dictionary, which maps to lists. These - # dimensions are then iterated over to generate the relevant csv output. - dimensions = {} - - if stat not in ["median", "all", "meanstd"]: - raise ValueError("Illegal value for --result, see help.") - - def squish(key: str, value: str): - """Collapse an argument into kwargs or dimensions.""" - if key in dimensions: - # Extend an existing dimension. - dimensions[key].append(value) - elif key in kwargs: - # Create a new dimension. - dimensions[key] = [kwargs[key], value] - del kwargs[key] - else: - # A single value. - kwargs[key] = value - - for item in ctx.args: - if "=" in method: - # This must be the method. The method is simply set to the first - # non-matching argument, which we're also parsing here. - item, method = method, item - if "=" not in item: - logging.error("illegal argument: %s", item) - sys.exit(1) - (key, value) = item.lstrip("-").split("=", 1) - squish(key, value) - - # Convert runtime and metric to dimensions. - # - # They exist only in the arguments above for documentation purposes. - # Essentially here we are treating them like anything else. Note however, - # that an empty set here will result in a dimension. This is important for - # metrics, where an empty set actually means all metrics. - def fold(key: str, value, allow_flatten=False): - """Collapse a list value into kwargs or dimensions.""" - if len(value) == 1 and allow_flatten: - kwargs[key] = value[0] - else: - dimensions[key] = value - - fold("runtime", runtime, allow_flatten=True) - fold("metric", metric) - - # Lookup the methods. - # - # We match the method parameter to a regular expression. This allows you to - # do things like `run --mock .*` for a broad test. Note that we track the - # short_names in the dimensions here, and look up again in the recursion. - methods = { - short_name: func for (short_name, _, func) in find_benchmarks(method) - } - if not methods: - # Must match at least one method. - logging.error("no matching benchmarks for %s: try list.", method) - sys.exit(1) - fold("method", list(methods.keys()), allow_flatten=True) - - # Spin up the drivers. - # - # We ensure that metric is the last entry, because we have special behavior. - # They actually run the test once and the benchmark is a generator that - # produces all viable metrics. - dimension_keys = list(dimensions.keys()) - if "metric" in dimension_keys: - dimension_keys.remove("metric") - dimension_keys.append("metric") - drivers = [] - - def _start(keywords, finished, left): - """Runs a test across dimensions recursively.""" - # Resolve the method fully, it starts as a string. - if "method" in keywords and isinstance(keywords["method"], str): - keywords["method"] = methods[keywords["method"]] - # Is this a non-recursive case? - if not left: - driver = benchmark_driver.BenchmarkDriver(producer, runs=runs, **keywords) - driver.start() - drivers.append((finished, driver)) - else: - # Recurse on the next dimension. - current, left = left[0], left[1:] - keywords = copy.deepcopy(keywords) - if current == "metric": - # We use a generator, popped below. Note that metric is - # guaranteed to be the last element here, and we will provide - # the value for 'done' below when generating the csv. - keywords[current] = dimensions[current] - _start(keywords, finished, left) - else: - # Generate manually. - for value in dimensions[current]: - keywords[current] = value - _start(keywords, finished + [value], left) - - # Start all the drivers, recursively. - _start(kwargs, [], dimension_keys) - - # Finish all tests, write results. - output = csv.writer(sys.stdout) - output.writerow(dimension_keys + ["result"]) - for (done, driver) in drivers: - driver.join() - for (metric_name, result) in getattr(driver, stat)(): - output.writerow([ # Collapse the method name. - hasattr(x, "__name__") and x.__name__ or x for x in done - ] + [metric_name] + result) - - -@runner.command() -@click.argument("env") -@click.option( - "--cmd", default="uname -a", help="command to run on all found machines") -@click.option( - "--workload", default="true", help="workload to run all found machines") -def validate(env, cmd, workload): - """Validates an environment described by yaml file.""" - producer = yaml_producer.YamlMachineProducer(env) - for machine in producer.machines: - print("Machine %s:" % machine) - stdout, _ = machine.run(cmd) - print(" Output of '%s': %s" % (cmd, stdout.lstrip().rstrip())) - image = machine.pull(workload) - stdout = machine.container(image).run() - print(" Container %s: %s" % (workload, stdout.lstrip().rstrip())) diff --git a/benchmarks/runner/commands.py b/benchmarks/runner/commands.py deleted file mode 100644 index 9a391eb01..000000000 --- a/benchmarks/runner/commands.py +++ /dev/null @@ -1,135 +0,0 @@ -# python3 -# Copyright 2019 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Module with the guts of `click` commands. - -Overrides of the click.core.Command. This is done so flags are inherited between -similar commands (the run command). The classes below are meant to be used in -click templates like so. - -@runner.command("run-mock", RunCommand) -def run_mock(**kwargs): - # mock implementation - -""" -import os - -import click - - -class RunCommand(click.core.Command): - """Base Run Command with flags. - - Attributes: - method: regex of which suite to choose (e.g. sysbench would run - sysbench.cpu, sysbench.memory, and sysbench.mutex) See list command for - details. - metric: metric(s) to extract. See list command for details. - runtime: the runtime(s) on which to run. - runs: the number of runs to do of each method. - stat: how to compile results in the case of multiple run (e.g. median). - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - method = click.core.Argument(("method",)) - - metric = click.core.Option(("--metric",), - help="The metric to extract.", - multiple=True) - - runtime = click.core.Option(("--runtime",), - default=["runc"], - help="The runtime to use.", - multiple=True) - runs = click.core.Option(("--runs",), - default=1, - help="The number of times to run each benchmark.") - stat = click.core.Option( - ("--stat",), - default="median", - help="How to aggregate the data from all runs." - "\nmedian - returns the median of all runs (default)" - "\nall - returns all results comma separated" - "\nmeanstd - returns result as mean,std") - self.params.extend([method, runtime, runs, stat, metric]) - self.ignore_unknown_options = True - self.allow_extra_args = True - - -class LocalCommand(RunCommand): - """LocalCommand inherits all flags from RunCommand. - - Attributes: - limit: limits the number of machines on which to run benchmarks. This limits - for local how many benchmarks may run at a time. e.g. "startup" requires - one machine -- passing two machines would limit two startup jobs at a - time. Default is infinity. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.params.append( - click.core.Option( - ("--limit",), - default=1, - help="Limit of number of benchmarks that can run at a given time.")) - - -class GCPCommand(RunCommand): - """GCPCommand inherits all flags from RunCommand and adds flags for run_gcp method. - - Attributes: - image_file: name of the image to build machines from - zone_file: a GCP zone (e.g. us-west1-b) - installers: named installers for post-create - machine_type: type of machine to create (e.g. n1-standard-4) - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - image_file = click.core.Option( - ("--image_file",), - help="The binary that emits the GCP image.", - default=os.path.join( - os.path.dirname(__file__), "../../tools/vm/ubuntu1604"), - ) - zone_file = click.core.Option( - ("--zone_file",), - help="The binary that emits the GCP zone.", - default=os.path.join(os.path.dirname(__file__), "../../tools/vm/zone"), - ) - internal = click.core.Option( - ("--internal/--no-internal",), - help="""Use instance internal IPs. Used if bm-tools runner is running on - GCP instance with firewall rules blocking external IPs.""", - default=False, - ) - installers = click.core.Option( - ("--installers",), - help="The set of installers to use.", - multiple=True, - ) - machine_type = click.core.Option( - ("--machine_type",), - help="Type to make all machines.", - default="n1-standard-4", - ) - self.params.extend([ - image_file, - zone_file, - internal, - machine_type, - installers, - ]) diff --git a/benchmarks/runner/runner_test.py b/benchmarks/runner/runner_test.py deleted file mode 100644 index 7818d631a..000000000 --- a/benchmarks/runner/runner_test.py +++ /dev/null @@ -1,59 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Top-level tests.""" - -import os -import subprocess -import sys - -from click import testing -import pytest - -from benchmarks import runner - - -def _get_locale(): - output = subprocess.check_output(["locale", "-a"]) - locales = output.split() - if b"en_US.utf8" in locales: - return "en_US.UTF-8" - else: - return "C.UTF-8" - - -def _set_locale(): - locale = _get_locale() - if os.getenv("LANG") != locale: - os.environ["LANG"] = locale - os.environ["LC_ALL"] = locale - os.execv("/proc/self/exe", ["python"] + sys.argv) - - -def test_list(): - cli_runner = testing.CliRunner() - result = cli_runner.invoke(runner.runner, ["list"]) - print(result.output) - assert result.exit_code == 0 - - -def test_run(): - cli_runner = testing.CliRunner() - result = cli_runner.invoke(runner.runner, ["run-mock", "."]) - print(result.output) - assert result.exit_code == 0 - - -if __name__ == "__main__": - _set_locale() - sys.exit(pytest.main([__file__])) diff --git a/benchmarks/suites/BUILD b/benchmarks/suites/BUILD deleted file mode 100644 index 04fc23261..000000000 --- a/benchmarks/suites/BUILD +++ /dev/null @@ -1,130 +0,0 @@ -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -py_library( - name = "suites", - srcs = ["__init__.py"], -) - -py_library( - name = "absl", - srcs = ["absl.py"], - deps = [ - "//benchmarks/harness:machine", - "//benchmarks/suites", - "//benchmarks/workloads/absl", - ], -) - -py_library( - name = "density", - srcs = ["density.py"], - deps = [ - "//benchmarks/harness:container", - "//benchmarks/harness:machine", - "//benchmarks/suites", - "//benchmarks/suites:helpers", - ], -) - -py_library( - name = "fio", - srcs = ["fio.py"], - deps = [ - "//benchmarks/harness:machine", - "//benchmarks/suites", - "//benchmarks/suites:helpers", - "//benchmarks/workloads/fio", - ], -) - -py_library( - name = "helpers", - srcs = ["helpers.py"], - deps = ["//benchmarks/harness:machine"], -) - -py_library( - name = "http", - srcs = ["http.py"], - deps = [ - "//benchmarks/harness:machine", - "//benchmarks/suites", - "//benchmarks/workloads/ab", - ], -) - -py_library( - name = "media", - srcs = ["media.py"], - deps = [ - "//benchmarks/harness:machine", - "//benchmarks/suites", - "//benchmarks/suites:helpers", - "//benchmarks/workloads/ffmpeg", - ], -) - -py_library( - name = "ml", - srcs = ["ml.py"], - deps = [ - "//benchmarks/harness:machine", - "//benchmarks/suites", - "//benchmarks/suites:startup", - "//benchmarks/workloads/tensorflow", - ], -) - -py_library( - name = "network", - srcs = ["network.py"], - deps = [ - "//benchmarks/harness:machine", - "//benchmarks/suites", - "//benchmarks/suites:helpers", - "//benchmarks/workloads/iperf", - ], -) - -py_library( - name = "redis", - srcs = ["redis.py"], - deps = [ - "//benchmarks/harness:machine", - "//benchmarks/suites", - "//benchmarks/workloads/redisbenchmark", - ], -) - -py_library( - name = "startup", - srcs = ["startup.py"], - deps = [ - "//benchmarks/harness:machine", - "//benchmarks/suites", - "//benchmarks/suites:helpers", - ], -) - -py_library( - name = "sysbench", - srcs = ["sysbench.py"], - deps = [ - "//benchmarks/harness:machine", - "//benchmarks/suites", - "//benchmarks/workloads/sysbench", - ], -) - -py_library( - name = "syscall", - srcs = ["syscall.py"], - deps = [ - "//benchmarks/harness:machine", - "//benchmarks/suites", - "//benchmarks/workloads/syscall", - ], -) diff --git a/benchmarks/suites/__init__.py b/benchmarks/suites/__init__.py deleted file mode 100644 index 360736cc3..000000000 --- a/benchmarks/suites/__init__.py +++ /dev/null @@ -1,119 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Core benchmark annotations.""" - -import functools -import inspect -import types -from typing import List -from typing import Tuple - -BENCHMARK_METRICS = '__benchmark_metrics__' -BENCHMARK_MACHINES = '__benchmark_machines__' - - -def is_benchmark(func: types.FunctionType) -> bool: - """Returns true if the given function is a benchmark.""" - return isinstance(func, types.FunctionType) and \ - hasattr(func, BENCHMARK_METRICS) and \ - hasattr(func, BENCHMARK_MACHINES) - - -def benchmark_metrics(func: types.FunctionType) -> List[Tuple[str, str]]: - """Returns the list of available metrics.""" - return [(metric.__name__, metric.__doc__) - for metric in getattr(func, BENCHMARK_METRICS)] - - -def benchmark_machines(func: types.FunctionType) -> int: - """Returns the number of machines required.""" - return getattr(func, BENCHMARK_MACHINES) - - -# pylint: disable=unused-argument -def default(value, **kwargs): - """Returns the passed value.""" - return value - - -def benchmark(metrics: List[types.FunctionType] = None, - machines: int = 1) -> types.FunctionType: - """Define a benchmark function with metrics. - - Args: - metrics: A list of metric functions. - machines: The number of machines required. - - Returns: - A function that accepts the given number of machines, and iteratively - returns a set of (metric_name, metric_value) pairs when called repeatedly. - """ - if not metrics: - # The default passes through. - metrics = [default] - - def decorator(func: types.FunctionType) -> types.FunctionType: - """Decorator function.""" - # Every benchmark should accept at least two parameters: - # runtime: The runtime to use for the benchmark (str, required). - # metrics: The metrics to use, if not the default (str, optional). - @functools.wraps(func) - def wrapper(*args, runtime: str, metric: list = None, **kwargs): - """Wrapper function.""" - # First -- ensure that we marshall all types appropriately. In - # general, we will call this with only strings. These strings will - # need to be converted to their underlying types/classes. - sig = inspect.signature(func) - for param in sig.parameters.values(): - if param.annotation != inspect.Parameter.empty and \ - param.name in kwargs and not isinstance(kwargs[param.name], param.annotation): - try: - # Marshall to the appropriate type. - kwargs[param.name] = param.annotation(kwargs[param.name]) - except Exception as exc: - raise ValueError( - 'illegal type for %s(%s=%s): %s' % - (func.__name__, param.name, kwargs[param.name], exc)) - elif param.default != inspect.Parameter.empty and \ - param.name not in kwargs: - # Ensure that we have the value set, because it will - # be passed to the metric function for evaluation. - kwargs[param.name] = param.default - - # Next, figure out how to apply a metric. We do this prior to - # running the underlying function to prevent having to wait a few - # minutes for a result just to see some error. - if not metric: - # Return all metrics in the iterator. - result = func(*args, runtime=runtime, **kwargs) - for metric_func in metrics: - yield (metric_func.__name__, metric_func(result, **kwargs)) - else: - result = None - for single_metric in metric: - for metric_func in metrics: - # Is this a function that matches the name? - # Apply this function to the result. - if metric_func.__name__ == single_metric: - if not result: - # Lazy evaluation: only if metric matches. - result = func(*args, runtime=runtime, **kwargs) - yield single_metric, metric_func(result, **kwargs) - - # Set metadata on the benchmark (used above). - setattr(wrapper, BENCHMARK_METRICS, metrics) - setattr(wrapper, BENCHMARK_MACHINES, machines) - return wrapper - - return decorator diff --git a/benchmarks/suites/absl.py b/benchmarks/suites/absl.py deleted file mode 100644 index 5d9b57a09..000000000 --- a/benchmarks/suites/absl.py +++ /dev/null @@ -1,37 +0,0 @@ -# python3 -# Copyright 2019 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""absl build benchmark.""" - -from benchmarks import suites -from benchmarks.harness import machine -from benchmarks.workloads import absl - - -@suites.benchmark(metrics=[absl.elapsed_time], machines=1) -def build(target: machine.Machine, **kwargs) -> str: - """Runs the absl workload and report the absl build time. - - Runs the 'bazel build //absl/...' in a clean bazel directory and - monitors time elapsed. - - Args: - target: A machine object. - **kwargs: Additional container options. - - Returns: - Container output. - """ - image = target.pull("absl") - return target.container(image, **kwargs).run() diff --git a/benchmarks/suites/density.py b/benchmarks/suites/density.py deleted file mode 100644 index 89d29fb26..000000000 --- a/benchmarks/suites/density.py +++ /dev/null @@ -1,121 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Density tests.""" - -import re -import types - -from benchmarks import suites -from benchmarks.harness import container -from benchmarks.harness import machine -from benchmarks.suites import helpers - - -# pylint: disable=unused-argument -def memory_usage(value, **kwargs): - """Returns the passed value.""" - return value - - -def density(target: machine.Machine, - workload: str, - count: int = 50, - wait: float = 0, - load_func: types.FunctionType = None, - **kwargs): - """Calculate the average memory usage per container. - - Args: - target: A machine object. - workload: The workload to run. - count: The number of containers to start. - wait: The time to wait after starting. - load_func: Callback that is called after count images have been started on - the given machine. - **kwargs: Additional container options. - - Returns: - The average usage in Kb per container. - """ - count = int(count) - - # Drop all caches. - helpers.drop_caches(target) - before = target.read("/proc/meminfo") - - # Load the workload. - image = target.pull(workload) - - with target.container( - image=image, count=count, **kwargs).detach() as containers: - # Call the optional load function callback if given. - if load_func: - load_func(target, containers) - # Wait 'wait' time before taking a measurement. - target.sleep(wait) - - # Drop caches again. - helpers.drop_caches(target) - after = target.read("/proc/meminfo") - - # Calculate the memory used. - available_re = re.compile(r"MemAvailable:\s*(\d+)\skB\n") - before_available = available_re.findall(before) - after_available = available_re.findall(after) - return 1024 * float(int(before_available[0]) - - int(after_available[0])) / float(count) - - -def load_redis(target: machine.Machine, containers: container.Container): - """Use redis-benchmark "LPUSH" to load each container with 1G of data. - - Args: - target: A machine object. - containers: A set of containers. - """ - target.pull("redisbenchmark") - for name in containers.get_names(): - flags = "-d 10000 -t LPUSH" - target.container( - "redisbenchmark", links={ - name: name - }).run( - host=name, flags=flags) - - -@suites.benchmark(metrics=[memory_usage], machines=1) -def empty(target: machine.Machine, **kwargs) -> float: - """Run trivial containers in a density test.""" - return density(target, workload="sleep", wait=1.0, **kwargs) - - -@suites.benchmark(metrics=[memory_usage], machines=1) -def node(target: machine.Machine, **kwargs) -> float: - """Run node containers in a density test.""" - return density(target, workload="node", wait=3.0, **kwargs) - - -@suites.benchmark(metrics=[memory_usage], machines=1) -def ruby(target: machine.Machine, **kwargs) -> float: - """Run ruby containers in a density test.""" - return density(target, workload="ruby", wait=3.0, **kwargs) - - -@suites.benchmark(metrics=[memory_usage], machines=1) -def redis(target: machine.Machine, **kwargs) -> float: - """Run redis containers in a density test.""" - if "count" not in kwargs: - kwargs["count"] = 5 - return density( - target, workload="redis", wait=3.0, load_func=load_redis, **kwargs) diff --git a/benchmarks/suites/fio.py b/benchmarks/suites/fio.py deleted file mode 100644 index 2171790c5..000000000 --- a/benchmarks/suites/fio.py +++ /dev/null @@ -1,165 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""File I/O tests.""" - -import os - -from benchmarks import suites -from benchmarks.harness import machine -from benchmarks.suites import helpers -from benchmarks.workloads import fio - - -# pylint: disable=too-many-arguments -# pylint: disable=too-many-locals -def run_fio(target: machine.Machine, - test: str, - ioengine: str = "sync", - size: int = 1024 * 1024 * 1024, - iodepth: int = 4, - blocksize: int = 1024 * 1024, - time: int = -1, - mount_dir: str = "", - filename: str = "file.dat", - tmpfs: bool = False, - ramp_time: int = 0, - **kwargs) -> str: - """FIO benchmarks. - - For more on fio see: - https://media.readthedocs.org/pdf/fio/latest/fio.pdf - - Args: - target: A machine object. - test: The test to run (read, write, randread, randwrite, etc.) - ioengine: The engine for I/O. - size: The size of the generated file in bytes (if an integer) or 5g, 16k, - etc. - iodepth: The I/O for certain engines. - blocksize: The blocksize for reads and writes in bytes (if an integer) or - 4k, etc. - time: If test is time based, how long to run in seconds. - mount_dir: The absolute path on the host to mount a bind mount. - filename: The name of the file to creat inside container. For a path of - /dir/dir/file, the script setup a volume like 'docker run -v - mount_dir:/dir/dir fio' and fio will create (and delete) the file - /dir/dir/file. If tmpfs is set, this /dir/dir will be a tmpfs. - tmpfs: If true, mount on tmpfs. - ramp_time: The time to run before recording statistics - **kwargs: Additional container options. - - Returns: - The output of fio as a string. - """ - # Pull the image before dropping caches. - image = target.pull("fio") - - if not mount_dir: - stdout, _ = target.run("pwd") - mount_dir = stdout.rstrip() - - # Setup the volumes. - volumes = {mount_dir: {"bind": "/disk", "mode": "rw"}} if not tmpfs else None - tmpfs = {"/disk": ""} if tmpfs else None - - # Construct a file in the volume. - filepath = os.path.join("/disk", filename) - - # If we are running a read test, us fio to write a file and then flush file - # data from memory. - if "read" in test: - target.container( - image, volumes=volumes, tmpfs=tmpfs, **kwargs).run( - test="write", - ioengine="sync", - size=size, - iodepth=iodepth, - blocksize=blocksize, - path=filepath) - helpers.drop_caches(target) - - # Run the test. - time_str = "--time_base --runtime={time}".format( - time=time) if int(time) > 0 else "" - res = target.container( - image, volumes=volumes, tmpfs=tmpfs, **kwargs).run( - test=test, - ioengine=ioengine, - size=size, - iodepth=iodepth, - blocksize=blocksize, - time=time_str, - path=filepath, - ramp_time=ramp_time) - - target.run( - "rm {path}".format(path=os.path.join(mount_dir.rstrip(), filename))) - - return res - - -@suites.benchmark(metrics=[fio.read_bandwidth, fio.read_io_ops], machines=1) -def read(*args, **kwargs): - """Read test. - - Args: - *args: None. - **kwargs: Additional container options. - - Returns: - The output of fio. - """ - return run_fio(*args, test="read", **kwargs) - - -@suites.benchmark(metrics=[fio.read_bandwidth, fio.read_io_ops], machines=1) -def randread(*args, **kwargs): - """Random read test. - - Args: - *args: None. - **kwargs: Additional container options. - - Returns: - The output of fio. - """ - return run_fio(*args, test="randread", **kwargs) - - -@suites.benchmark(metrics=[fio.write_bandwidth, fio.write_io_ops], machines=1) -def write(*args, **kwargs): - """Write test. - - Args: - *args: None. - **kwargs: Additional container options. - - Returns: - The output of fio. - """ - return run_fio(*args, test="write", **kwargs) - - -@suites.benchmark(metrics=[fio.write_bandwidth, fio.write_io_ops], machines=1) -def randwrite(*args, **kwargs): - """Random write test. - - Args: - *args: None. - **kwargs: Additional container options. - - Returns: - The output of fio. - """ - return run_fio(*args, test="randwrite", **kwargs) diff --git a/benchmarks/suites/helpers.py b/benchmarks/suites/helpers.py deleted file mode 100644 index b3c7360ab..000000000 --- a/benchmarks/suites/helpers.py +++ /dev/null @@ -1,57 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Benchmark helpers.""" - -import datetime -from benchmarks.harness import machine - - -class Timer: - """Helper to time runtime of some call. - - Usage: - - with Timer as t: - # do something. - t.get_time_in_seconds() - """ - - def __init__(self): - self._start = datetime.datetime.now() - - def __enter__(self): - self.start() - return self - - def start(self): - """Starts the timer.""" - self._start = datetime.datetime.now() - - def elapsed(self) -> float: - """Returns the elapsed time in seconds.""" - return (datetime.datetime.now() - self._start).total_seconds() - - def __exit__(self, exception_type, exception_value, exception_traceback): - pass - - -def drop_caches(target: machine.Machine): - """Drops caches on the machine. - - Args: - target: A machine object. - """ - target.run("sudo sync") - target.run("sudo sysctl vm.drop_caches=3") - target.run("sudo sysctl vm.drop_caches=3") diff --git a/benchmarks/suites/http.py b/benchmarks/suites/http.py deleted file mode 100644 index 6efea938c..000000000 --- a/benchmarks/suites/http.py +++ /dev/null @@ -1,138 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""HTTP benchmarks.""" - -from benchmarks import suites -from benchmarks.harness import machine -from benchmarks.workloads import ab - - -# pylint: disable=too-many-arguments -def http(server: machine.Machine, - client: machine.Machine, - workload: str, - requests: int = 5000, - connections: int = 10, - port: int = 80, - path: str = "notfound", - **kwargs) -> str: - """Run apachebench (ab) against an http server. - - Args: - server: A machine object. - client: A machine object. - workload: The http-serving workload. - requests: Number of requests to send the server. Default is 5000. - connections: Number of concurent connections to use. Default is 10. - port: The port to access in benchmarking. - path: File to download, generally workload-specific. - **kwargs: Additional container options. - - Returns: - The full apachebench output. - """ - # Pull the client & server. - apachebench = client.pull("ab") - netcat = client.pull("netcat") - image = server.pull(workload) - - with server.container(image, port=port, **kwargs).detach() as container: - (host, port) = container.address() - # Wait for the server to come up. - client.container(netcat).run(host=host, port=port) - # Run the benchmark, no arguments. - return client.container(apachebench).run( - host=host, - port=port, - requests=requests, - connections=connections, - path=path) - - -# pylint: disable=too-many-arguments -# pylint: disable=too-many-locals -def http_app(server: machine.Machine, - client: machine.Machine, - workload: str, - requests: int = 5000, - connections: int = 10, - port: int = 80, - path: str = "notfound", - **kwargs) -> str: - """Run apachebench (ab) against an http application. - - Args: - server: A machine object. - client: A machine object. - workload: The http-serving workload. - requests: Number of requests to send the server. Default is 5000. - connections: Number of concurent connections to use. Default is 10. - port: The port to use for benchmarking. - path: File to download, generally workload-specific. - **kwargs: Additional container options. - - Returns: - The full apachebench output. - """ - # Pull the client & server. - apachebench = client.pull("ab") - netcat = client.pull("netcat") - server_netcat = server.pull("netcat") - redis = server.pull("redis") - image = server.pull(workload) - redis_port = 6379 - redis_name = "{workload}_redis_server".format(workload=workload) - - with server.container(redis, name=redis_name).detach(): - server.container(server_netcat, links={redis_name: redis_name})\ - .run(host=redis_name, port=redis_port) - with server.container(image, port=port, links={redis_name: redis_name}, **kwargs)\ - .detach(host=redis_name) as container: - (host, port) = container.address() - # Wait for the server to come up. - client.container(netcat).run(host=host, port=port) - # Run the benchmark, no arguments. - return client.container(apachebench).run( - host=host, - port=port, - requests=requests, - connections=connections, - path=path) - - -@suites.benchmark(metrics=[ab.transfer_rate, ab.latency], machines=2) -def httpd(*args, **kwargs) -> str: - """Apache2 benchmark.""" - return http(*args, workload="httpd", port=80, **kwargs) - - -@suites.benchmark( - metrics=[ab.transfer_rate, ab.latency, ab.requests_per_second], machines=2) -def nginx(*args, **kwargs) -> str: - """Nginx benchmark.""" - return http(*args, workload="nginx", port=80, **kwargs) - - -@suites.benchmark( - metrics=[ab.transfer_rate, ab.latency, ab.requests_per_second], machines=2) -def node(*args, **kwargs) -> str: - """Node benchmark.""" - return http_app(*args, workload="node_template", path="", port=8080, **kwargs) - - -@suites.benchmark( - metrics=[ab.transfer_rate, ab.latency, ab.requests_per_second], machines=2) -def ruby(*args, **kwargs) -> str: - """Ruby benchmark.""" - return http_app(*args, workload="ruby_template", path="", port=9292, **kwargs) diff --git a/benchmarks/suites/media.py b/benchmarks/suites/media.py deleted file mode 100644 index 9cbffdaa1..000000000 --- a/benchmarks/suites/media.py +++ /dev/null @@ -1,42 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Media processing benchmarks.""" - -from benchmarks import suites -from benchmarks.harness import machine -from benchmarks.suites import helpers -from benchmarks.workloads import ffmpeg - - -@suites.benchmark(metrics=[ffmpeg.run_time], machines=1) -def transcode(target: machine.Machine, **kwargs) -> float: - """Runs a video transcoding workload and times it. - - Args: - target: A machine object. - **kwargs: Additional container options. - - Returns: - Total workload runtime. - """ - # Load before timing. - image = target.pull("ffmpeg") - - # Drop caches. - helpers.drop_caches(target) - - # Time startup + transcoding. - with helpers.Timer() as timer: - target.container(image, **kwargs).run() - return timer.elapsed() diff --git a/benchmarks/suites/ml.py b/benchmarks/suites/ml.py deleted file mode 100644 index a394d1f69..000000000 --- a/benchmarks/suites/ml.py +++ /dev/null @@ -1,33 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Machine Learning tests.""" - -from benchmarks import suites -from benchmarks.harness import machine -from benchmarks.suites import startup -from benchmarks.workloads import tensorflow - - -@suites.benchmark(metrics=[tensorflow.run_time], machines=1) -def train(target: machine.Machine, **kwargs): - """Run the tensorflow benchmark and return the runtime in seconds of workload. - - Args: - target: A machine object. - **kwargs: Additional container options. - - Returns: - The total runtime. - """ - return startup.startup(target, workload="tensorflow", count=1, **kwargs) diff --git a/benchmarks/suites/network.py b/benchmarks/suites/network.py deleted file mode 100644 index f973cf3f1..000000000 --- a/benchmarks/suites/network.py +++ /dev/null @@ -1,101 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Network microbenchmarks.""" - -from typing import Dict - -from benchmarks import suites -from benchmarks.harness import machine -from benchmarks.suites import helpers -from benchmarks.workloads import iperf - - -def run_iperf(client: machine.Machine, - server: machine.Machine, - client_kwargs: Dict[str, str] = None, - server_kwargs: Dict[str, str] = None) -> str: - """Measure iperf performance. - - Args: - client: A machine object. - server: A machine object. - client_kwargs: Additional client container options. - server_kwargs: Additional server container options. - - Returns: - The output of iperf. - """ - if not client_kwargs: - client_kwargs = dict() - if not server_kwargs: - server_kwargs = dict() - - # Pull images. - netcat = client.pull("netcat") - iperf_client_image = client.pull("iperf") - iperf_server_image = server.pull("iperf") - - # Set this due to a bug in the kernel that resets connections. - client.run("sudo /sbin/sysctl -w net.netfilter.nf_conntrack_tcp_be_liberal=1") - server.run("sudo /sbin/sysctl -w net.netfilter.nf_conntrack_tcp_be_liberal=1") - - with server.container( - iperf_server_image, port=5001, **server_kwargs).detach() as iperf_server: - (host, port) = iperf_server.address() - # Wait until the service is available. - client.container(netcat).run(host=host, port=port) - # Run a warm-up run. - client.container( - iperf_client_image, stderr=True, **client_kwargs).run( - host=host, port=port) - # Run the client with relevant arguments. - res = client.container(iperf_client_image, stderr=True, **client_kwargs)\ - .run(host=host, port=port) - helpers.drop_caches(client) - return res - - -@suites.benchmark(metrics=[iperf.bandwidth], machines=2) -def upload(client: machine.Machine, server: machine.Machine, **kwargs) -> str: - """Measure upload performance. - - Args: - client: A machine object. - server: A machine object. - **kwargs: Client container options. - - Returns: - The output of iperf. - """ - if kwargs["runtime"] == "runc": - kwargs["network_mode"] = "host" - return run_iperf(client, server, client_kwargs=kwargs) - - -@suites.benchmark(metrics=[iperf.bandwidth], machines=2) -def download(client: machine.Machine, server: machine.Machine, **kwargs) -> str: - """Measure download performance. - - Args: - client: A machine object. - server: A machine object. - **kwargs: Server container options. - - Returns: - The output of iperf. - """ - - client_kwargs = {"network_mode": "host"} - return run_iperf( - client, server, client_kwargs=client_kwargs, server_kwargs=kwargs) diff --git a/benchmarks/suites/redis.py b/benchmarks/suites/redis.py deleted file mode 100644 index b84dd073d..000000000 --- a/benchmarks/suites/redis.py +++ /dev/null @@ -1,46 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Redis benchmarks.""" - -from benchmarks import suites -from benchmarks.harness import machine -from benchmarks.workloads import redisbenchmark - - -@suites.benchmark(metrics=list(redisbenchmark.METRICS.values()), machines=2) -def redis(server: machine.Machine, - client: machine.Machine, - flags: str = "", - **kwargs) -> str: - """Run redis-benchmark on client pointing at server machine. - - Args: - server: A machine object. - client: A machine object. - flags: Flags to pass redis-benchmark. - **kwargs: Additional container options. - - Returns: - Output from redis-benchmark. - """ - redis_server = server.pull("redis") - redis_client = client.pull("redisbenchmark") - netcat = client.pull("netcat") - with server.container( - redis_server, port=6379, **kwargs).detach() as container: - (host, port) = container.address() - # Wait for the container to be up. - client.container(netcat).run(host=host, port=port) - # Run all redis benchmarks. - return client.container(redis_client).run(host=host, port=port, flags=flags) diff --git a/benchmarks/suites/startup.py b/benchmarks/suites/startup.py deleted file mode 100644 index a1b6c5753..000000000 --- a/benchmarks/suites/startup.py +++ /dev/null @@ -1,110 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Start-up benchmarks.""" - -from benchmarks import suites -from benchmarks.harness import machine -from benchmarks.suites import helpers - - -# pylint: disable=unused-argument -def startup_time_ms(value, **kwargs): - """Returns average startup time per container in milliseconds. - - Args: - value: The floating point time in seconds. - **kwargs: Ignored. - - Returns: - The time given in milliseconds. - """ - return value * 1000 - - -def startup(target: machine.Machine, - workload: str, - count: int = 5, - port: int = 0, - **kwargs): - """Time the startup of some workload. - - Args: - target: A machine object. - workload: The workload to run. - count: Number of containers to start. - port: The port to check for liveness, if provided. - **kwargs: Additional container options. - - Returns: - The mean start-up time in seconds. - """ - # Load before timing. - image = target.pull(workload) - netcat = target.pull("netcat") - count = int(count) - port = int(port) - - with helpers.Timer() as timer: - for _ in range(count): - if not port: - # Run the container synchronously. - target.container(image, **kwargs).run() - else: - # Run a detached container until httpd available. - with target.container(image, port=port, **kwargs).detach() as server: - (server_host, server_port) = server.address() - target.container(netcat).run(host=server_host, port=server_port) - return timer.elapsed() / float(count) - - -@suites.benchmark(metrics=[startup_time_ms], machines=1) -def empty(target: machine.Machine, **kwargs) -> float: - """Time the startup of a trivial container. - - Args: - target: A machine object. - **kwargs: Additional startup options. - - Returns: - The time to run the container. - """ - return startup(target, workload="true", **kwargs) - - -@suites.benchmark(metrics=[startup_time_ms], machines=1) -def node(target: machine.Machine, **kwargs) -> float: - """Time the startup of the node container. - - Args: - target: A machine object. - **kwargs: Additional statup options. - - Returns: - The time to run the container. - """ - return startup(target, workload="node", port=8080, **kwargs) - - -@suites.benchmark(metrics=[startup_time_ms], machines=1) -def ruby(target: machine.Machine, **kwargs) -> float: - """Time the startup of the ruby container. - - Args: - target: A machine object. - **kwargs: Additional startup options. - - Returns: - The time to run the container. - """ - return startup(target, workload="ruby", port=3000, **kwargs) diff --git a/benchmarks/suites/sysbench.py b/benchmarks/suites/sysbench.py deleted file mode 100644 index 2a6e2126c..000000000 --- a/benchmarks/suites/sysbench.py +++ /dev/null @@ -1,119 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Sysbench-based benchmarks.""" - -from benchmarks import suites -from benchmarks.harness import machine -from benchmarks.workloads import sysbench - - -def run_sysbench(target: machine.Machine, - test: str = "cpu", - threads: int = 8, - time: int = 5, - options: str = "", - **kwargs) -> str: - """Run sysbench container with arguments. - - Args: - target: A machine object. - test: Relevant sysbench test to run (e.g. cpu, memory). - threads: The number of threads to use for tests. - time: The time to run tests. - options: Additional sysbench options. - **kwargs: Additional container options. - - Returns: - The output of the command as a string. - """ - image = target.pull("sysbench") - return target.container(image, **kwargs).run( - test=test, threads=threads, time=time, options=options) - - -@suites.benchmark(metrics=[sysbench.cpu_events_per_second], machines=1) -def cpu(target: machine.Machine, max_prime: int = 5000, **kwargs) -> str: - """Run sysbench CPU test. - - Additional arguments can be provided for sysbench. - - Args: - target: A machine object. - max_prime: The maximum prime number to search. - **kwargs: - - threads: The number of threads to use for tests. - - time: The time to run tests. - - options: Additional sysbench options. See sysbench tool: - https://github.com/akopytov/sysbench - - Returns: - Sysbench output. - """ - options = kwargs.pop("options", "") - options += " --cpu-max-prime={}".format(max_prime) - return run_sysbench(target, test="cpu", options=options, **kwargs) - - -@suites.benchmark(metrics=[sysbench.memory_ops_per_second], machines=1) -def memory(target: machine.Machine, **kwargs) -> str: - """Run sysbench memory test. - - Additional arguments can be provided per sysbench. - - Args: - target: A machine object. - **kwargs: - - threads: The number of threads to use for tests. - - time: The time to run tests. - - options: Additional sysbench options. See sysbench tool: - https://github.com/akopytov/sysbench - - Returns: - Sysbench output. - """ - return run_sysbench(target, test="memory", **kwargs) - - -@suites.benchmark( - metrics=[ - sysbench.mutex_time, sysbench.mutex_latency, sysbench.mutex_deviation - ], - machines=1) -def mutex(target: machine.Machine, - locks: int = 4, - count: int = 10000000, - threads: int = 8, - **kwargs) -> str: - """Run sysbench mutex test. - - Additional arguments can be provided per sysbench. - - Args: - target: A machine object. - locks: The number of locks to use. - count: The number of mutexes. - threads: The number of threads to use for tests. - **kwargs: - - time: The time to run tests. - - options: Additional sysbench options. See sysbench tool: - https://github.com/akopytov/sysbench - - Returns: - Sysbench output. - """ - options = kwargs.pop("options", "") - options += " --mutex-loops=1 --mutex-locks={} --mutex-num={}".format( - count, locks) - return run_sysbench( - target, test="mutex", options=options, threads=threads, **kwargs) diff --git a/benchmarks/suites/syscall.py b/benchmarks/suites/syscall.py deleted file mode 100644 index fa7665b00..000000000 --- a/benchmarks/suites/syscall.py +++ /dev/null @@ -1,37 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Syscall microbenchmark.""" - -from benchmarks import suites -from benchmarks.harness import machine -from benchmarks.workloads.syscall import syscall_time_ns - - -@suites.benchmark(metrics=[syscall_time_ns], machines=1) -def syscall(target: machine.Machine, count: int = 1000000, **kwargs) -> str: - """Runs the syscall workload and report the syscall time. - - Runs the syscall 'SYS_gettimeofday(0,0)' 'count' times and monitors time - elapsed based on the runtime's MONOTONIC clock. - - Args: - target: A machine object. - count: The number of syscalls to execute. - **kwargs: Additional container options. - - Returns: - Container output. - """ - image = target.pull("syscall") - return target.container(image, **kwargs).run(count=count) diff --git a/benchmarks/workloads/BUILD b/benchmarks/workloads/BUILD deleted file mode 100644 index ccb86af5b..000000000 --- a/benchmarks/workloads/BUILD +++ /dev/null @@ -1,35 +0,0 @@ -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -py_library( - name = "workloads", - srcs = ["__init__.py"], -) - -filegroup( - name = "files", - srcs = [ - "//benchmarks/workloads/ab:tar", - "//benchmarks/workloads/absl:tar", - "//benchmarks/workloads/curl:tar", - "//benchmarks/workloads/ffmpeg:tar", - "//benchmarks/workloads/fio:tar", - "//benchmarks/workloads/httpd:tar", - "//benchmarks/workloads/iperf:tar", - "//benchmarks/workloads/netcat:tar", - "//benchmarks/workloads/nginx:tar", - "//benchmarks/workloads/node:tar", - "//benchmarks/workloads/node_template:tar", - "//benchmarks/workloads/redis:tar", - "//benchmarks/workloads/redisbenchmark:tar", - "//benchmarks/workloads/ruby:tar", - "//benchmarks/workloads/ruby_template:tar", - "//benchmarks/workloads/sleep:tar", - "//benchmarks/workloads/sysbench:tar", - "//benchmarks/workloads/syscall:tar", - "//benchmarks/workloads/tensorflow:tar", - "//benchmarks/workloads/true:tar", - ], -) diff --git a/benchmarks/workloads/__init__.py b/benchmarks/workloads/__init__.py deleted file mode 100644 index e12651e76..000000000 --- a/benchmarks/workloads/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Workloads, parsers and test data.""" diff --git a/benchmarks/workloads/ab/BUILD b/benchmarks/workloads/ab/BUILD deleted file mode 100644 index 945ac7026..000000000 --- a/benchmarks/workloads/ab/BUILD +++ /dev/null @@ -1,28 +0,0 @@ -load("//tools:defs.bzl", "pkg_tar", "py_library", "py_test") -load("//benchmarks:defs.bzl", "test_deps") - -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -py_library( - name = "ab", - srcs = ["__init__.py"], -) - -py_test( - name = "ab_test", - srcs = ["ab_test.py"], - python_version = "PY3", - deps = test_deps + [ - ":ab", - ], -) - -pkg_tar( - name = "tar", - srcs = [ - "Dockerfile", - ], -) diff --git a/benchmarks/workloads/ab/Dockerfile b/benchmarks/workloads/ab/Dockerfile deleted file mode 100644 index 0d0b6e2eb..000000000 --- a/benchmarks/workloads/ab/Dockerfile +++ /dev/null @@ -1,15 +0,0 @@ -FROM ubuntu:18.04 - -RUN set -x \ - && apt-get update \ - && apt-get install -y \ - apache2-utils \ - && rm -rf /var/lib/apt/lists/* - -# Parameterized workload. -ENV requests 5000 -ENV connections 10 -ENV host localhost -ENV port 8080 -ENV path notfound -CMD ["sh", "-c", "ab -n ${requests} -c ${connections} http://${host}:${port}/${path}"] diff --git a/benchmarks/workloads/ab/__init__.py b/benchmarks/workloads/ab/__init__.py deleted file mode 100644 index eedf8e083..000000000 --- a/benchmarks/workloads/ab/__init__.py +++ /dev/null @@ -1,88 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Apachebench tool.""" - -import re - -SAMPLE_DATA = """This is ApacheBench, Version 2.3 <$Revision: 1826891 $> -Copyright 1996 Adam Twiss, Zeus Technology Ltd, http://www.zeustech.net/ -Licensed to The Apache Software Foundation, http://www.apache.org/ - -Benchmarking 10.10.10.10 (be patient).....done - - -Server Software: Apache/2.4.38 -Server Hostname: 10.10.10.10 -Server Port: 80 - -Document Path: /latin10k.txt -Document Length: 210 bytes - -Concurrency Level: 1 -Time taken for tests: 0.180 seconds -Complete requests: 100 -Failed requests: 0 -Non-2xx responses: 100 -Total transferred: 38800 bytes -HTML transferred: 21000 bytes -Requests per second: 556.44 [#/sec] (mean) -Time per request: 1.797 [ms] (mean) -Time per request: 1.797 [ms] (mean, across all concurrent requests) -Transfer rate: 210.84 [Kbytes/sec] received - -Connection Times (ms) - min mean[+/-sd] median max -Connect: 0 0 0.2 0 2 -Processing: 1 2 1.0 1 8 -Waiting: 1 1 1.0 1 7 -Total: 1 2 1.2 1 10 - -Percentage of the requests served within a certain time (ms) - 50% 1 - 66% 2 - 75% 2 - 80% 2 - 90% 2 - 95% 3 - 98% 7 - 99% 10 - 100% 10 (longest request)""" - - -# pylint: disable=unused-argument -def sample(**kwargs) -> str: - return SAMPLE_DATA - - -# pylint: disable=unused-argument -def transfer_rate(data: str, **kwargs) -> float: - """Mean transfer rate in Kbytes/sec.""" - regex = r"Transfer rate:\s+(\d+\.?\d+?)\s+\[Kbytes/sec\]\s+received" - return float(re.compile(regex).search(data).group(1)) - - -# pylint: disable=unused-argument -def latency(data: str, **kwargs) -> float: - """Mean latency in milliseconds.""" - regex = r"Total:\s+\d+\s+(\d+)\s+(\d+\.?\d+?)\s+\d+\s+\d+\s" - res = re.compile(regex).search(data) - return float(res.group(1)) - - -# pylint: disable=unused-argument -def requests_per_second(data: str, **kwargs) -> float: - """Requests per second.""" - regex = r"Requests per second:\s+(\d+\.?\d+?)\s+" - res = re.compile(regex).search(data) - return float(res.group(1)) diff --git a/benchmarks/workloads/ab/ab_test.py b/benchmarks/workloads/ab/ab_test.py deleted file mode 100644 index 4afac2996..000000000 --- a/benchmarks/workloads/ab/ab_test.py +++ /dev/null @@ -1,42 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Parser test.""" - -import sys - -import pytest - -from benchmarks.workloads import ab - - -def test_transfer_rate_parser(): - """Test transfer rate parser.""" - res = ab.transfer_rate(ab.sample()) - assert res == 210.84 - - -def test_latency_parser(): - """Test latency parser.""" - res = ab.latency(ab.sample()) - assert res == 2 - - -def test_requests_per_second(): - """Test requests per second parser.""" - res = ab.requests_per_second(ab.sample()) - assert res == 556.44 - - -if __name__ == "__main__": - sys.exit(pytest.main([__file__])) diff --git a/benchmarks/workloads/absl/BUILD b/benchmarks/workloads/absl/BUILD deleted file mode 100644 index bb1a308bf..000000000 --- a/benchmarks/workloads/absl/BUILD +++ /dev/null @@ -1,28 +0,0 @@ -load("//tools:defs.bzl", "pkg_tar", "py_library", "py_test") -load("//benchmarks:defs.bzl", "test_deps") - -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -py_library( - name = "absl", - srcs = ["__init__.py"], -) - -py_test( - name = "absl_test", - srcs = ["absl_test.py"], - python_version = "PY3", - deps = test_deps + [ - ":absl", - ], -) - -pkg_tar( - name = "tar", - srcs = [ - "Dockerfile", - ], -) diff --git a/benchmarks/workloads/absl/Dockerfile b/benchmarks/workloads/absl/Dockerfile deleted file mode 100644 index f29cfa156..000000000 --- a/benchmarks/workloads/absl/Dockerfile +++ /dev/null @@ -1,25 +0,0 @@ -FROM ubuntu:18.04 - -RUN set -x \ - && apt-get update \ - && apt-get install -y \ - wget \ - git \ - pkg-config \ - zip \ - g++ \ - zlib1g-dev \ - unzip \ - python3 \ - && rm -rf /var/lib/apt/lists/* -RUN wget https://github.com/bazelbuild/bazel/releases/download/0.27.0/bazel-0.27.0-installer-linux-x86_64.sh -RUN chmod +x bazel-0.27.0-installer-linux-x86_64.sh -RUN ./bazel-0.27.0-installer-linux-x86_64.sh - -RUN mkdir abseil-cpp && cd abseil-cpp \ - && git init && git remote add origin https://github.com/abseil/abseil-cpp.git \ - && git fetch --depth 1 origin 43ef2148c0936ebf7cb4be6b19927a9d9d145b8f && git checkout FETCH_HEAD -WORKDIR abseil-cpp -RUN bazel clean -ENV path "absl/base/..." -CMD bazel build ${path} 2>&1 diff --git a/benchmarks/workloads/absl/__init__.py b/benchmarks/workloads/absl/__init__.py deleted file mode 100644 index b40e3f915..000000000 --- a/benchmarks/workloads/absl/__init__.py +++ /dev/null @@ -1,63 +0,0 @@ -# python3 -# Copyright 2019 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""ABSL build benchmark.""" - -import re - -SAMPLE_BAZEL_OUTPUT = """Extracting Bazel installation... -Starting local Bazel server and connecting to it... -Loading: -Loading: 0 packages loaded -Loading: 0 packages loaded - currently loading: absl/algorithm ... (11 packages) -Analyzing: 241 targets (16 packages loaded, 0 targets configured) -Analyzing: 241 targets (21 packages loaded, 617 targets configured) -Analyzing: 241 targets (27 packages loaded, 687 targets configured) -Analyzing: 241 targets (32 packages loaded, 1105 targets configured) -Analyzing: 241 targets (32 packages loaded, 1294 targets configured) -Analyzing: 241 targets (35 packages loaded, 1575 targets configured) -Analyzing: 241 targets (35 packages loaded, 1575 targets configured) -Analyzing: 241 targets (36 packages loaded, 1603 targets configured) -Analyzing: 241 targets (36 packages loaded, 1603 targets configured) -INFO: Analyzed 241 targets (37 packages loaded, 1864 targets configured). -INFO: Found 241 targets... -[0 / 5] [Prepa] BazelWorkspaceStatusAction stable-status.txt -[16 / 50] [Analy] Compiling absl/base/dynamic_annotations.cc ... (20 actions, 10 running) -[60 / 77] Compiling external/com_google_googletest/googletest/src/gtest.cc; 5s processwrapper-sandbox ... (12 actions, 11 running) -[158 / 174] Compiling absl/container/internal/raw_hash_set_test.cc; 2s processwrapper-sandbox ... (12 actions, 11 running) -[278 / 302] Compiling absl/container/internal/raw_hash_set_test.cc; 6s processwrapper-sandbox ... (12 actions, 11 running) -[384 / 406] Compiling absl/container/internal/raw_hash_set_test.cc; 10s processwrapper-sandbox ... (12 actions, 11 running) -[581 / 604] Compiling absl/container/flat_hash_set_test.cc; 11s processwrapper-sandbox ... (12 actions, 11 running) -[722 / 745] Compiling absl/container/node_hash_set_test.cc; 9s processwrapper-sandbox ... (12 actions, 11 running) -[846 / 867] Compiling absl/hash/hash_test.cc; 11s processwrapper-sandbox ... (12 actions, 11 running) -INFO: From Compiling absl/debugging/symbolize_test.cc: -/tmp/cclCVipU.s: Assembler messages: -/tmp/cclCVipU.s:1662: Warning: ignoring changed section attributes for .text -[999 / 1,022] Compiling absl/hash/hash_test.cc; 19s processwrapper-sandbox ... (12 actions, 11 running) -[1,082 / 1,084] Compiling absl/container/flat_hash_map_test.cc; 7s processwrapper-sandbox -INFO: Elapsed time: 81.861s, Critical Path: 23.81s -INFO: 515 processes: 515 processwrapper-sandbox. -INFO: Build completed successfully, 1084 total actions -INFO: Build completed successfully, 1084 total actions""" - - -def sample(): - return SAMPLE_BAZEL_OUTPUT - - -# pylint: disable=unused-argument -def elapsed_time(data: str, **kwargs) -> float: - """Returns the elapsed time for running an absl build.""" - return float(re.compile(r"Elapsed time: (\d*.?\d*)s").search(data).group(1)) diff --git a/benchmarks/workloads/absl/absl_test.py b/benchmarks/workloads/absl/absl_test.py deleted file mode 100644 index 41f216999..000000000 --- a/benchmarks/workloads/absl/absl_test.py +++ /dev/null @@ -1,31 +0,0 @@ -# python3 -# Copyright 2019 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""ABSL build test.""" - -import sys - -import pytest - -from benchmarks.workloads import absl - - -def test_elapsed_time(): - """Test elapsed_time.""" - res = absl.elapsed_time(absl.sample()) - assert res == 81.861 - - -if __name__ == "__main__": - sys.exit(pytest.main([__file__])) diff --git a/benchmarks/workloads/curl/BUILD b/benchmarks/workloads/curl/BUILD deleted file mode 100644 index a70873065..000000000 --- a/benchmarks/workloads/curl/BUILD +++ /dev/null @@ -1,13 +0,0 @@ -load("//tools:defs.bzl", "pkg_tar") - -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -pkg_tar( - name = "tar", - srcs = [ - "Dockerfile", - ], -) diff --git a/benchmarks/workloads/curl/Dockerfile b/benchmarks/workloads/curl/Dockerfile deleted file mode 100644 index 336cb088a..000000000 --- a/benchmarks/workloads/curl/Dockerfile +++ /dev/null @@ -1,14 +0,0 @@ -FROM ubuntu:18.04 - -RUN set -x \ - && apt-get update \ - && apt-get install -y \ - curl \ - && rm -rf /var/lib/apt/lists/* - -# Accept a host and port parameter. -ENV host localhost -ENV port 8080 - -# Spin until we make a successful request. -CMD ["sh", "-c", "while ! curl -v -i http://$host:$port; do true; done"] diff --git a/benchmarks/workloads/ffmpeg/BUILD b/benchmarks/workloads/ffmpeg/BUILD deleted file mode 100644 index 7c41ba631..000000000 --- a/benchmarks/workloads/ffmpeg/BUILD +++ /dev/null @@ -1,18 +0,0 @@ -load("//tools:defs.bzl", "pkg_tar") - -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -py_library( - name = "ffmpeg", - srcs = ["__init__.py"], -) - -pkg_tar( - name = "tar", - srcs = [ - "Dockerfile", - ], -) diff --git a/benchmarks/workloads/ffmpeg/Dockerfile b/benchmarks/workloads/ffmpeg/Dockerfile deleted file mode 100644 index f2f530d7c..000000000 --- a/benchmarks/workloads/ffmpeg/Dockerfile +++ /dev/null @@ -1,10 +0,0 @@ -FROM ubuntu:18.04 - -RUN set -x \ - && apt-get update \ - && apt-get install -y \ - ffmpeg \ - && rm -rf /var/lib/apt/lists/* -WORKDIR /media -ADD https://samples.ffmpeg.org/MPEG-4/video.mp4 video.mp4 -CMD ["ffmpeg", "-i", "video.mp4", "-c:v", "libx264", "-preset", "veryslow", "output.mp4"] diff --git a/benchmarks/workloads/ffmpeg/__init__.py b/benchmarks/workloads/ffmpeg/__init__.py deleted file mode 100644 index 7578a443b..000000000 --- a/benchmarks/workloads/ffmpeg/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Simple ffmpeg workload.""" - - -# pylint: disable=unused-argument -def run_time(value, **kwargs): - """Returns the startup and runtime of the ffmpeg workload in seconds.""" - return value diff --git a/benchmarks/workloads/fio/BUILD b/benchmarks/workloads/fio/BUILD deleted file mode 100644 index 24d909c53..000000000 --- a/benchmarks/workloads/fio/BUILD +++ /dev/null @@ -1,28 +0,0 @@ -load("//tools:defs.bzl", "pkg_tar", "py_library", "py_test") -load("//benchmarks:defs.bzl", "test_deps") - -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -py_library( - name = "fio", - srcs = ["__init__.py"], -) - -py_test( - name = "fio_test", - srcs = ["fio_test.py"], - python_version = "PY3", - deps = test_deps + [ - ":fio", - ], -) - -pkg_tar( - name = "tar", - srcs = [ - "Dockerfile", - ], -) diff --git a/benchmarks/workloads/fio/Dockerfile b/benchmarks/workloads/fio/Dockerfile deleted file mode 100644 index b3cf864eb..000000000 --- a/benchmarks/workloads/fio/Dockerfile +++ /dev/null @@ -1,23 +0,0 @@ -FROM ubuntu:18.04 - -RUN set -x \ - && apt-get update \ - && apt-get install -y \ - fio \ - && rm -rf /var/lib/apt/lists/* - -# Parameterized test. -ENV test write -ENV ioengine sync -ENV size 5000000 -ENV iodepth 4 -ENV blocksize "1m" -ENV time "" -ENV path "/disk/file.dat" -ENV ramp_time 0 - -CMD ["sh", "-c", "fio --output-format=json --name=test --ramp_time=${ramp_time} --ioengine=${ioengine} --size=${size} \ ---filename=${path} --iodepth=${iodepth} --bs=${blocksize} --rw=${test} ${time}"] - - - diff --git a/benchmarks/workloads/fio/__init__.py b/benchmarks/workloads/fio/__init__.py deleted file mode 100644 index 52711e956..000000000 --- a/benchmarks/workloads/fio/__init__.py +++ /dev/null @@ -1,369 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""FIO benchmark tool.""" - -import json - -SAMPLE_DATA = """ -{ - "fio version" : "fio-3.1", - "timestamp" : 1554837456, - "timestamp_ms" : 1554837456621, - "time" : "Tue Apr 9 19:17:36 2019", - "jobs" : [ - { - "jobname" : "test", - "groupid" : 0, - "error" : 0, - "eta" : 2147483647, - "elapsed" : 1, - "job options" : { - "name" : "test", - "ioengine" : "sync", - "size" : "1073741824", - "filename" : "/disk/file.dat", - "iodepth" : "4", - "bs" : "4096", - "rw" : "write" - }, - "read" : { - "io_bytes" : 0, - "io_kbytes" : 0, - "bw" : 0, - "iops" : 0.000000, - "runtime" : 0, - "total_ios" : 0, - "short_ios" : 0, - "drop_ios" : 0, - "slat_ns" : { - "min" : 0, - "max" : 0, - "mean" : 0.000000, - "stddev" : 0.000000 - }, - "clat_ns" : { - "min" : 0, - "max" : 0, - "mean" : 0.000000, - "stddev" : 0.000000, - "percentile" : { - "1.000000" : 0, - "5.000000" : 0, - "10.000000" : 0, - "20.000000" : 0, - "30.000000" : 0, - "40.000000" : 0, - "50.000000" : 0, - "60.000000" : 0, - "70.000000" : 0, - "80.000000" : 0, - "90.000000" : 0, - "95.000000" : 0, - "99.000000" : 0, - "99.500000" : 0, - "99.900000" : 0, - "99.950000" : 0, - "99.990000" : 0, - "0.00" : 0, - "0.00" : 0, - "0.00" : 0 - } - }, - "lat_ns" : { - "min" : 0, - "max" : 0, - "mean" : 0.000000, - "stddev" : 0.000000 - }, - "bw_min" : 0, - "bw_max" : 0, - "bw_agg" : 0.000000, - "bw_mean" : 0.000000, - "bw_dev" : 0.000000, - "bw_samples" : 0, - "iops_min" : 0, - "iops_max" : 0, - "iops_mean" : 0.000000, - "iops_stddev" : 0.000000, - "iops_samples" : 0 - }, - "write" : { - "io_bytes" : 1073741824, - "io_kbytes" : 1048576, - "bw" : 1753471, - "iops" : 438367.892977, - "runtime" : 598, - "total_ios" : 262144, - "short_ios" : 0, - "drop_ios" : 0, - "slat_ns" : { - "min" : 0, - "max" : 0, - "mean" : 0.000000, - "stddev" : 0.000000 - }, - "clat_ns" : { - "min" : 1693, - "max" : 754733, - "mean" : 2076.404373, - "stddev" : 1724.195529, - "percentile" : { - "1.000000" : 1736, - "5.000000" : 1752, - "10.000000" : 1768, - "20.000000" : 1784, - "30.000000" : 1800, - "40.000000" : 1800, - "50.000000" : 1816, - "60.000000" : 1816, - "70.000000" : 1848, - "80.000000" : 1928, - "90.000000" : 2512, - "95.000000" : 2992, - "99.000000" : 6176, - "99.500000" : 6304, - "99.900000" : 11328, - "99.950000" : 15168, - "99.990000" : 17792, - "0.00" : 0, - "0.00" : 0, - "0.00" : 0 - } - }, - "lat_ns" : { - "min" : 1731, - "max" : 754770, - "mean" : 2117.878979, - "stddev" : 1730.290512 - }, - "bw_min" : 1731120, - "bw_max" : 1731120, - "bw_agg" : 98.725328, - "bw_mean" : 1731120.000000, - "bw_dev" : 0.000000, - "bw_samples" : 1, - "iops_min" : 432780, - "iops_max" : 432780, - "iops_mean" : 432780.000000, - "iops_stddev" : 0.000000, - "iops_samples" : 1 - }, - "trim" : { - "io_bytes" : 0, - "io_kbytes" : 0, - "bw" : 0, - "iops" : 0.000000, - "runtime" : 0, - "total_ios" : 0, - "short_ios" : 0, - "drop_ios" : 0, - "slat_ns" : { - "min" : 0, - "max" : 0, - "mean" : 0.000000, - "stddev" : 0.000000 - }, - "clat_ns" : { - "min" : 0, - "max" : 0, - "mean" : 0.000000, - "stddev" : 0.000000, - "percentile" : { - "1.000000" : 0, - "5.000000" : 0, - "10.000000" : 0, - "20.000000" : 0, - "30.000000" : 0, - "40.000000" : 0, - "50.000000" : 0, - "60.000000" : 0, - "70.000000" : 0, - "80.000000" : 0, - "90.000000" : 0, - "95.000000" : 0, - "99.000000" : 0, - "99.500000" : 0, - "99.900000" : 0, - "99.950000" : 0, - "99.990000" : 0, - "0.00" : 0, - "0.00" : 0, - "0.00" : 0 - } - }, - "lat_ns" : { - "min" : 0, - "max" : 0, - "mean" : 0.000000, - "stddev" : 0.000000 - }, - "bw_min" : 0, - "bw_max" : 0, - "bw_agg" : 0.000000, - "bw_mean" : 0.000000, - "bw_dev" : 0.000000, - "bw_samples" : 0, - "iops_min" : 0, - "iops_max" : 0, - "iops_mean" : 0.000000, - "iops_stddev" : 0.000000, - "iops_samples" : 0 - }, - "usr_cpu" : 17.922948, - "sys_cpu" : 81.574539, - "ctx" : 3, - "majf" : 0, - "minf" : 10, - "iodepth_level" : { - "1" : 100.000000, - "2" : 0.000000, - "4" : 0.000000, - "8" : 0.000000, - "16" : 0.000000, - "32" : 0.000000, - ">=64" : 0.000000 - }, - "latency_ns" : { - "2" : 0.000000, - "4" : 0.000000, - "10" : 0.000000, - "20" : 0.000000, - "50" : 0.000000, - "100" : 0.000000, - "250" : 0.000000, - "500" : 0.000000, - "750" : 0.000000, - "1000" : 0.000000 - }, - "latency_us" : { - "2" : 82.737350, - "4" : 12.605286, - "10" : 4.543686, - "20" : 0.107956, - "50" : 0.010000, - "100" : 0.000000, - "250" : 0.000000, - "500" : 0.000000, - "750" : 0.000000, - "1000" : 0.010000 - }, - "latency_ms" : { - "2" : 0.000000, - "4" : 0.000000, - "10" : 0.000000, - "20" : 0.000000, - "50" : 0.000000, - "100" : 0.000000, - "250" : 0.000000, - "500" : 0.000000, - "750" : 0.000000, - "1000" : 0.000000, - "2000" : 0.000000, - ">=2000" : 0.000000 - }, - "latency_depth" : 4, - "latency_target" : 0, - "latency_percentile" : 100.000000, - "latency_window" : 0 - } - ], - "disk_util" : [ - { - "name" : "dm-1", - "read_ios" : 0, - "write_ios" : 3, - "read_merges" : 0, - "write_merges" : 0, - "read_ticks" : 0, - "write_ticks" : 0, - "in_queue" : 0, - "util" : 0.000000, - "aggr_read_ios" : 0, - "aggr_write_ios" : 3, - "aggr_read_merges" : 0, - "aggr_write_merge" : 0, - "aggr_read_ticks" : 0, - "aggr_write_ticks" : 0, - "aggr_in_queue" : 0, - "aggr_util" : 0.000000 - }, - { - "name" : "dm-0", - "read_ios" : 0, - "write_ios" : 3, - "read_merges" : 0, - "write_merges" : 0, - "read_ticks" : 0, - "write_ticks" : 0, - "in_queue" : 0, - "util" : 0.000000, - "aggr_read_ios" : 0, - "aggr_write_ios" : 3, - "aggr_read_merges" : 0, - "aggr_write_merge" : 0, - "aggr_read_ticks" : 0, - "aggr_write_ticks" : 2, - "aggr_in_queue" : 0, - "aggr_util" : 0.000000 - }, - { - "name" : "nvme0n1", - "read_ios" : 0, - "write_ios" : 3, - "read_merges" : 0, - "write_merges" : 0, - "read_ticks" : 0, - "write_ticks" : 2, - "in_queue" : 0, - "util" : 0.000000 - } - ] -} -""" - - -# pylint: disable=unused-argument -def sample(**kwargs) -> str: - return SAMPLE_DATA - - -# pylint: disable=unused-argument -def read_bandwidth(data: str, **kwargs) -> int: - """File I/O bandwidth.""" - return json.loads(data)["jobs"][0]["read"]["bw"] * 1024 - - -# pylint: disable=unused-argument -def write_bandwidth(data: str, **kwargs) -> int: - """File I/O bandwidth.""" - return json.loads(data)["jobs"][0]["write"]["bw"] * 1024 - - -# pylint: disable=unused-argument -def read_io_ops(data: str, **kwargs) -> float: - """File I/O operations per second.""" - return float(json.loads(data)["jobs"][0]["read"]["iops"]) - - -# pylint: disable=unused-argument -def write_io_ops(data: str, **kwargs) -> float: - """File I/O operations per second.""" - return float(json.loads(data)["jobs"][0]["write"]["iops"]) - - -# Change function names so we just print "bandwidth" and "io_ops". -read_bandwidth.__name__ = "bandwidth" -write_bandwidth.__name__ = "bandwidth" -read_io_ops.__name__ = "io_ops" -write_io_ops.__name__ = "io_ops" diff --git a/benchmarks/workloads/fio/fio_test.py b/benchmarks/workloads/fio/fio_test.py deleted file mode 100644 index 04a6eeb7e..000000000 --- a/benchmarks/workloads/fio/fio_test.py +++ /dev/null @@ -1,44 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Parser tests.""" - -import sys - -import pytest - -from benchmarks.workloads import fio - - -def test_read_io_ops(): - """Test read ops parser.""" - assert fio.read_io_ops(fio.sample()) == 0.0 - - -def test_write_io_ops(): - """Test write ops parser.""" - assert fio.write_io_ops(fio.sample()) == 438367.892977 - - -def test_read_bandwidth(): - """Test read bandwidth parser.""" - assert fio.read_bandwidth(fio.sample()) == 0.0 - - -def test_write_bandwith(): - """Test write bandwidth parser.""" - assert fio.write_bandwidth(fio.sample()) == 1753471 * 1024 - - -if __name__ == "__main__": - sys.exit(pytest.main([__file__])) diff --git a/benchmarks/workloads/httpd/BUILD b/benchmarks/workloads/httpd/BUILD deleted file mode 100644 index 83450d190..000000000 --- a/benchmarks/workloads/httpd/BUILD +++ /dev/null @@ -1,14 +0,0 @@ -load("//tools:defs.bzl", "pkg_tar") - -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -pkg_tar( - name = "tar", - srcs = [ - "Dockerfile", - "apache2-tmpdir.conf", - ], -) diff --git a/benchmarks/workloads/httpd/Dockerfile b/benchmarks/workloads/httpd/Dockerfile deleted file mode 100644 index 52a550678..000000000 --- a/benchmarks/workloads/httpd/Dockerfile +++ /dev/null @@ -1,27 +0,0 @@ -FROM ubuntu:18.04 - -RUN set -x \ - && apt-get update \ - && apt-get install -y \ - apache2 \ - && rm -rf /var/lib/apt/lists/* - -# Generate a bunch of relevant files. -RUN mkdir -p /local && \ - for size in 1 10 100 1000 1024 10240; do \ - dd if=/dev/zero of=/local/latin${size}k.txt count=${size} bs=1024; \ - done - -# Rewrite DocumentRoot to point to /tmp/html instead of the default path. -RUN sed -i 's/DocumentRoot.*\/var\/www\/html$/DocumentRoot \/tmp\/html/' /etc/apache2/sites-enabled/000-default.conf -COPY ./apache2-tmpdir.conf /etc/apache2/sites-enabled/apache2-tmpdir.conf - -# Standard settings. -ENV APACHE_RUN_DIR /tmp -ENV APACHE_RUN_USER nobody -ENV APACHE_RUN_GROUP nogroup -ENV APACHE_LOG_DIR /tmp -ENV APACHE_PID_FILE /tmp/apache.pid - -# Copy on start-up; serve everything from /tmp (including the configuration). -CMD ["sh", "-c", "mkdir -p /tmp/html && cp -a /local/* /tmp/html && apache2 -X"] diff --git a/benchmarks/workloads/httpd/apache2-tmpdir.conf b/benchmarks/workloads/httpd/apache2-tmpdir.conf deleted file mode 100644 index e33f8d9bb..000000000 --- a/benchmarks/workloads/httpd/apache2-tmpdir.conf +++ /dev/null @@ -1,5 +0,0 @@ -<Directory /tmp/html/> - Options Indexes FollowSymLinks - AllowOverride None - Require all granted -</Directory>
\ No newline at end of file diff --git a/benchmarks/workloads/iperf/BUILD b/benchmarks/workloads/iperf/BUILD deleted file mode 100644 index 91b953718..000000000 --- a/benchmarks/workloads/iperf/BUILD +++ /dev/null @@ -1,28 +0,0 @@ -load("//tools:defs.bzl", "pkg_tar", "py_library", "py_test") -load("//benchmarks:defs.bzl", "test_deps") - -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -py_library( - name = "iperf", - srcs = ["__init__.py"], -) - -py_test( - name = "iperf_test", - srcs = ["iperf_test.py"], - python_version = "PY3", - deps = test_deps + [ - ":iperf", - ], -) - -pkg_tar( - name = "tar", - srcs = [ - "Dockerfile", - ], -) diff --git a/benchmarks/workloads/iperf/Dockerfile b/benchmarks/workloads/iperf/Dockerfile deleted file mode 100644 index 9704c506c..000000000 --- a/benchmarks/workloads/iperf/Dockerfile +++ /dev/null @@ -1,14 +0,0 @@ -FROM ubuntu:18.04 - -RUN set -x \ - && apt-get update \ - && apt-get install -y \ - iperf \ - && rm -rf /var/lib/apt/lists/* - -# Accept a host parameter. -ENV host "" -ENV port 5001 - -# Start as client if the host is provided. -CMD ["sh", "-c", "test -z \"${host}\" && iperf -s || iperf -f K --realtime -c ${host} -p ${port}"] diff --git a/benchmarks/workloads/iperf/__init__.py b/benchmarks/workloads/iperf/__init__.py deleted file mode 100644 index 3817a7ade..000000000 --- a/benchmarks/workloads/iperf/__init__.py +++ /dev/null @@ -1,40 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""iperf.""" - -import re - -SAMPLE_DATA = """ ------------------------------------------------------------- -Client connecting to 10.138.15.215, TCP port 32779 -TCP window size: 45.0 KByte (default) ------------------------------------------------------------- -[ 3] local 10.138.15.216 port 32866 connected with 10.138.15.215 port 32779 -[ ID] Interval Transfer Bandwidth -[ 3] 0.0-10.0 sec 459520 KBytes 45900 KBytes/sec - -""" - - -# pylint: disable=unused-argument -def sample(**kwargs) -> str: - return SAMPLE_DATA - - -# pylint: disable=unused-argument -def bandwidth(data: str, **kwargs) -> float: - """Calculate the bandwidth.""" - regex = r"\[\s*\d+\][^\n]+\s+(\d+\.?\d*)\s+KBytes/sec" - res = re.compile(regex).search(data) - return float(res.group(1)) * 1000 diff --git a/benchmarks/workloads/iperf/iperf_test.py b/benchmarks/workloads/iperf/iperf_test.py deleted file mode 100644 index 6959b7e8a..000000000 --- a/benchmarks/workloads/iperf/iperf_test.py +++ /dev/null @@ -1,28 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tests for iperf.""" - -import sys - -import pytest - -from benchmarks.workloads import iperf - - -def test_bandwidth(): - assert iperf.bandwidth(iperf.sample()) == 45900 * 1000 - - -if __name__ == "__main__": - sys.exit(pytest.main([__file__])) diff --git a/benchmarks/workloads/netcat/BUILD b/benchmarks/workloads/netcat/BUILD deleted file mode 100644 index a70873065..000000000 --- a/benchmarks/workloads/netcat/BUILD +++ /dev/null @@ -1,13 +0,0 @@ -load("//tools:defs.bzl", "pkg_tar") - -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -pkg_tar( - name = "tar", - srcs = [ - "Dockerfile", - ], -) diff --git a/benchmarks/workloads/netcat/Dockerfile b/benchmarks/workloads/netcat/Dockerfile deleted file mode 100644 index d8548d89a..000000000 --- a/benchmarks/workloads/netcat/Dockerfile +++ /dev/null @@ -1,14 +0,0 @@ -FROM ubuntu:18.04 - -RUN set -x \ - && apt-get update \ - && apt-get install -y \ - netcat \ - && rm -rf /var/lib/apt/lists/* - -# Accept a host and port parameter. -ENV host localhost -ENV port 8080 - -# Spin until we make a successful request. -CMD ["sh", "-c", "while ! nc -zv $host $port; do true; done"] diff --git a/benchmarks/workloads/nginx/BUILD b/benchmarks/workloads/nginx/BUILD deleted file mode 100644 index a70873065..000000000 --- a/benchmarks/workloads/nginx/BUILD +++ /dev/null @@ -1,13 +0,0 @@ -load("//tools:defs.bzl", "pkg_tar") - -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -pkg_tar( - name = "tar", - srcs = [ - "Dockerfile", - ], -) diff --git a/benchmarks/workloads/node/BUILD b/benchmarks/workloads/node/BUILD deleted file mode 100644 index bfcf78cf9..000000000 --- a/benchmarks/workloads/node/BUILD +++ /dev/null @@ -1,15 +0,0 @@ -load("//tools:defs.bzl", "pkg_tar") - -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -pkg_tar( - name = "tar", - srcs = [ - "Dockerfile", - "index.js", - "package.json", - ], -) diff --git a/benchmarks/workloads/node/Dockerfile b/benchmarks/workloads/node/Dockerfile deleted file mode 100644 index 139a38bf5..000000000 --- a/benchmarks/workloads/node/Dockerfile +++ /dev/null @@ -1,2 +0,0 @@ -FROM node:onbuild -CMD ["node", "index.js"] diff --git a/benchmarks/workloads/node/index.js b/benchmarks/workloads/node/index.js deleted file mode 100644 index 584158462..000000000 --- a/benchmarks/workloads/node/index.js +++ /dev/null @@ -1,28 +0,0 @@ -'use strict'; - -var start = new Date().getTime(); - -// Load dependencies to simulate an average nodejs app. -var req_0 = require('async'); -var req_1 = require('bluebird'); -var req_2 = require('firebase'); -var req_3 = require('firebase-admin'); -var req_4 = require('@google-cloud/container'); -var req_5 = require('@google-cloud/logging'); -var req_6 = require('@google-cloud/monitoring'); -var req_7 = require('@google-cloud/spanner'); -var req_8 = require('lodash'); -var req_9 = require('mailgun-js'); -var req_10 = require('request'); -var express = require('express'); -var app = express(); - -var loaded = new Date().getTime() - start; -app.get('/', function(req, res) { - res.send('Hello World!<br>Loaded in ' + loaded + 'ms'); -}); - -console.log('Loaded in ' + loaded + ' ms'); -app.listen(8080, function() { - console.log('Listening on port 8080...'); -}); diff --git a/benchmarks/workloads/node/package.json b/benchmarks/workloads/node/package.json deleted file mode 100644 index c00b9b3cb..000000000 --- a/benchmarks/workloads/node/package.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "name": "node", - "version": "1.0.0", - "main": "index.js", - "dependencies": { - "@google-cloud/container": "^0.3.0", - "@google-cloud/logging": "^4.2.0", - "@google-cloud/monitoring": "^0.6.0", - "@google-cloud/spanner": "^2.2.1", - "async": "^2.6.1", - "bluebird": "^3.5.3", - "express": "^4.16.4", - "firebase": "^5.7.2", - "firebase-admin": "^6.4.0", - "lodash": "^4.17.11", - "mailgun-js": "^0.22.0", - "request": "^2.88.0" - } -} diff --git a/benchmarks/workloads/node_template/BUILD b/benchmarks/workloads/node_template/BUILD deleted file mode 100644 index e142f082a..000000000 --- a/benchmarks/workloads/node_template/BUILD +++ /dev/null @@ -1,17 +0,0 @@ -load("//tools:defs.bzl", "pkg_tar") - -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -pkg_tar( - name = "tar", - srcs = [ - "Dockerfile", - "index.hbs", - "index.js", - "package.json", - "package-lock.json", - ], -) diff --git a/benchmarks/workloads/node_template/Dockerfile b/benchmarks/workloads/node_template/Dockerfile deleted file mode 100644 index 7eb065d54..000000000 --- a/benchmarks/workloads/node_template/Dockerfile +++ /dev/null @@ -1,5 +0,0 @@ -FROM node:onbuild - -ENV host "127.0.0.1" - -CMD ["sh", "-c", "node index.js ${host}"] diff --git a/benchmarks/workloads/node_template/index.hbs b/benchmarks/workloads/node_template/index.hbs deleted file mode 100644 index 03feceb75..000000000 --- a/benchmarks/workloads/node_template/index.hbs +++ /dev/null @@ -1,8 +0,0 @@ -<!DOCTYPE html> -<html> -<body> - {{#each text}} - <p>{{this}}</p> - {{/each}} -</body> -</html> diff --git a/benchmarks/workloads/node_template/index.js b/benchmarks/workloads/node_template/index.js deleted file mode 100644 index 04a27f356..000000000 --- a/benchmarks/workloads/node_template/index.js +++ /dev/null @@ -1,43 +0,0 @@ -const app = require('express')(); -const path = require('path'); -const redis = require('redis'); -const srs = require('secure-random-string'); - -// The hostname is the first argument. -const host_name = process.argv[2]; - -var client = redis.createClient({host: host_name, detect_buffers: true}); - -app.set('views', __dirname); -app.set('view engine', 'hbs'); - -app.get('/', (req, res) => { - var tmp = []; - /* Pull four random keys from the redis server. */ - for (i = 0; i < 4; i++) { - client.get(Math.floor(Math.random() * (100)), function(err, reply) { - tmp.push(reply.toString()); - }); - } - - res.render('index', {text: tmp}); -}); - -/** - * Securely generate a random string. - * @param {number} len - * @return {string} - */ -function randomBody(len) { - return srs({alphanumeric: true, length: len}); -} - -/** Mutates one hundred keys randomly. */ -function generateText() { - for (i = 0; i < 100; i++) { - client.set(i, randomBody(1024)); - } -} - -generateText(); -app.listen(8080); diff --git a/benchmarks/workloads/node_template/package-lock.json b/benchmarks/workloads/node_template/package-lock.json deleted file mode 100644 index 580e68aa5..000000000 --- a/benchmarks/workloads/node_template/package-lock.json +++ /dev/null @@ -1,486 +0,0 @@ -{ - "name": "nodedum", - "version": "1.0.0", - "lockfileVersion": 1, - "requires": true, - "dependencies": { - "accepts": { - "version": "1.3.5", - "resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.5.tgz", - "integrity": "sha1-63d99gEXI6OxTopywIBcjoZ0a9I=", - "requires": { - "mime-types": "~2.1.18", - "negotiator": "0.6.1" - } - }, - "array-flatten": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz", - "integrity": "sha1-ml9pkFGx5wczKPKgCJaLZOopVdI=" - }, - "async": { - "version": "2.6.2", - "resolved": "https://registry.npmjs.org/async/-/async-2.6.2.tgz", - "integrity": "sha512-H1qVYh1MYhEEFLsP97cVKqCGo7KfCyTt6uEWqsTBr9SO84oK9Uwbyd/yCW+6rKJLHksBNUVWZDAjfS+Ccx0Bbg==", - "requires": { - "lodash": "^4.17.11" - } - }, - "body-parser": { - "version": "1.18.3", - "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.18.3.tgz", - "integrity": "sha1-WykhmP/dVTs6DyDe0FkrlWlVyLQ=", - "requires": { - "bytes": "3.0.0", - "content-type": "~1.0.4", - "debug": "2.6.9", - "depd": "~1.1.2", - "http-errors": "~1.6.3", - "iconv-lite": "0.4.23", - "on-finished": "~2.3.0", - "qs": "6.5.2", - "raw-body": "2.3.3", - "type-is": "~1.6.16" - } - }, - "bytes": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.0.0.tgz", - "integrity": "sha1-0ygVQE1olpn4Wk6k+odV3ROpYEg=" - }, - "commander": { - "version": "2.20.0", - "resolved": "https://registry.npmjs.org/commander/-/commander-2.20.0.tgz", - "integrity": "sha512-7j2y+40w61zy6YC2iRNpUe/NwhNyoXrYpHMrSunaMG64nRnaf96zO/KMQR4OyN/UnE5KLyEBnKHd4aG3rskjpQ==", - "optional": true - }, - "content-disposition": { - "version": "0.5.2", - "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.2.tgz", - "integrity": "sha1-DPaLud318r55YcOoUXjLhdunjLQ=" - }, - "content-type": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.4.tgz", - "integrity": "sha512-hIP3EEPs8tB9AT1L+NUqtwOAps4mk2Zob89MWXMHjHWg9milF/j4osnnQLXBCBFBk/tvIG/tUc9mOUJiPBhPXA==" - }, - "cookie": { - "version": "0.3.1", - "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.3.1.tgz", - "integrity": "sha1-5+Ch+e9DtMi6klxcWpboBtFoc7s=" - }, - "cookie-signature": { - "version": "1.0.6", - "resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.6.tgz", - "integrity": "sha1-4wOogrNCzD7oylE6eZmXNNqzriw=" - }, - "debug": { - "version": "2.6.9", - "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", - "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", - "requires": { - "ms": "2.0.0" - } - }, - "depd": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz", - "integrity": "sha1-m81S4UwJd2PnSbJ0xDRu0uVgtak=" - }, - "destroy": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/destroy/-/destroy-1.0.4.tgz", - "integrity": "sha1-l4hXRCxEdJ5CBmE+N5RiBYJqvYA=" - }, - "double-ended-queue": { - "version": "2.1.0-0", - "resolved": "https://registry.npmjs.org/double-ended-queue/-/double-ended-queue-2.1.0-0.tgz", - "integrity": "sha1-ED01J/0xUo9AGIEwyEHv3XgmTlw=" - }, - "ee-first": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz", - "integrity": "sha1-WQxhFWsK4vTwJVcyoViyZrxWsh0=" - }, - "encodeurl": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz", - "integrity": "sha1-rT/0yG7C0CkyL1oCw6mmBslbP1k=" - }, - "escape-html": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/escape-html/-/escape-html-1.0.3.tgz", - "integrity": "sha1-Aljq5NPQwJdN4cFpGI7wBR0dGYg=" - }, - "etag": { - "version": "1.8.1", - "resolved": "https://registry.npmjs.org/etag/-/etag-1.8.1.tgz", - "integrity": "sha1-Qa4u62XvpiJorr/qg6x9eSmbCIc=" - }, - "express": { - "version": "4.16.4", - "resolved": "https://registry.npmjs.org/express/-/express-4.16.4.tgz", - "integrity": "sha512-j12Uuyb4FMrd/qQAm6uCHAkPtO8FDTRJZBDd5D2KOL2eLaz1yUNdUB/NOIyq0iU4q4cFarsUCrnFDPBcnksuOg==", - "requires": { - "accepts": "~1.3.5", - "array-flatten": "1.1.1", - "body-parser": "1.18.3", - "content-disposition": "0.5.2", - "content-type": "~1.0.4", - "cookie": "0.3.1", - "cookie-signature": "1.0.6", - "debug": "2.6.9", - "depd": "~1.1.2", - "encodeurl": "~1.0.2", - "escape-html": "~1.0.3", - "etag": "~1.8.1", - "finalhandler": "1.1.1", - "fresh": "0.5.2", - "merge-descriptors": "1.0.1", - "methods": "~1.1.2", - "on-finished": "~2.3.0", - "parseurl": "~1.3.2", - "path-to-regexp": "0.1.7", - "proxy-addr": "~2.0.4", - "qs": "6.5.2", - "range-parser": "~1.2.0", - "safe-buffer": "5.1.2", - "send": "0.16.2", - "serve-static": "1.13.2", - "setprototypeof": "1.1.0", - "statuses": "~1.4.0", - "type-is": "~1.6.16", - "utils-merge": "1.0.1", - "vary": "~1.1.2" - } - }, - "finalhandler": { - "version": "1.1.1", - "resolved": "http://registry.npmjs.org/finalhandler/-/finalhandler-1.1.1.tgz", - "integrity": "sha512-Y1GUDo39ez4aHAw7MysnUD5JzYX+WaIj8I57kO3aEPT1fFRL4sr7mjei97FgnwhAyyzRYmQZaTHb2+9uZ1dPtg==", - "requires": { - "debug": "2.6.9", - "encodeurl": "~1.0.2", - "escape-html": "~1.0.3", - "on-finished": "~2.3.0", - "parseurl": "~1.3.2", - "statuses": "~1.4.0", - "unpipe": "~1.0.0" - } - }, - "foreachasync": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/foreachasync/-/foreachasync-3.0.0.tgz", - "integrity": "sha1-VQKYfchxS+M5IJfzLgBxyd7gfPY=" - }, - "forwarded": { - "version": "0.1.2", - "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.1.2.tgz", - "integrity": "sha1-mMI9qxF1ZXuMBXPozszZGw/xjIQ=" - }, - "fresh": { - "version": "0.5.2", - "resolved": "https://registry.npmjs.org/fresh/-/fresh-0.5.2.tgz", - "integrity": "sha1-PYyt2Q2XZWn6g1qx+OSyOhBWBac=" - }, - "handlebars": { - "version": "4.0.14", - "resolved": "https://registry.npmjs.org/handlebars/-/handlebars-4.0.14.tgz", - "integrity": "sha512-E7tDoyAA8ilZIV3xDJgl18sX3M8xB9/fMw8+mfW4msLW8jlX97bAnWgT3pmaNXuvzIEgSBMnAHfuXsB2hdzfow==", - "requires": { - "async": "^2.5.0", - "optimist": "^0.6.1", - "source-map": "^0.6.1", - "uglify-js": "^3.1.4" - } - }, - "hbs": { - "version": "4.0.4", - "resolved": "https://registry.npmjs.org/hbs/-/hbs-4.0.4.tgz", - "integrity": "sha512-esVlyV/V59mKkwFai5YmPRSNIWZzhqL5YMN0++ueMxyK1cCfPa5f6JiHtapPKAIVAhQR6rpGxow0troav9WMEg==", - "requires": { - "handlebars": "4.0.14", - "walk": "2.3.9" - } - }, - "http-errors": { - "version": "1.6.3", - "resolved": "http://registry.npmjs.org/http-errors/-/http-errors-1.6.3.tgz", - "integrity": "sha1-i1VoC7S+KDoLW/TqLjhYC+HZMg0=", - "requires": { - "depd": "~1.1.2", - "inherits": "2.0.3", - "setprototypeof": "1.1.0", - "statuses": ">= 1.4.0 < 2" - } - }, - "iconv-lite": { - "version": "0.4.23", - "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.23.tgz", - "integrity": "sha512-neyTUVFtahjf0mB3dZT77u+8O0QB89jFdnBkd5P1JgYPbPaia3gXXOVL2fq8VyU2gMMD7SaN7QukTB/pmXYvDA==", - "requires": { - "safer-buffer": ">= 2.1.2 < 3" - } - }, - "inherits": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz", - "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4=" - }, - "ipaddr.js": { - "version": "1.8.0", - "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.8.0.tgz", - "integrity": "sha1-6qM9bd16zo9/b+DJygRA5wZzix4=" - }, - "lodash": { - "version": "4.17.15", - "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.15.tgz", - "integrity": "sha512-8xOcRHvCjnocdS5cpwXQXVzmmh5e5+saE2QGoeQmbKmRS6J3VQppPOIt0MnmE+4xlZoumy0GPG0D0MVIQbNA1A==" - }, - "media-typer": { - "version": "0.3.0", - "resolved": "http://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz", - "integrity": "sha1-hxDXrwqmJvj/+hzgAWhUUmMlV0g=" - }, - "merge-descriptors": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.1.tgz", - "integrity": "sha1-sAqqVW3YtEVoFQ7J0blT8/kMu2E=" - }, - "methods": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/methods/-/methods-1.1.2.tgz", - "integrity": "sha1-VSmk1nZUE07cxSZmVoNbD4Ua/O4=" - }, - "mime": { - "version": "1.4.1", - "resolved": "https://registry.npmjs.org/mime/-/mime-1.4.1.tgz", - "integrity": "sha512-KI1+qOZu5DcW6wayYHSzR/tXKCDC5Om4s1z2QJjDULzLcmf3DvzS7oluY4HCTrc+9FiKmWUgeNLg7W3uIQvxtQ==" - }, - "mime-db": { - "version": "1.37.0", - "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.37.0.tgz", - "integrity": "sha512-R3C4db6bgQhlIhPU48fUtdVmKnflq+hRdad7IyKhtFj06VPNVdk2RhiYL3UjQIlso8L+YxAtFkobT0VK+S/ybg==" - }, - "mime-types": { - "version": "2.1.21", - "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.21.tgz", - "integrity": "sha512-3iL6DbwpyLzjR3xHSFNFeb9Nz/M8WDkX33t1GFQnFOllWk8pOrh/LSrB5OXlnlW5P9LH73X6loW/eogc+F5lJg==", - "requires": { - "mime-db": "~1.37.0" - } - }, - "minimist": { - "version": "0.0.10", - "resolved": "https://registry.npmjs.org/minimist/-/minimist-0.0.10.tgz", - "integrity": "sha1-3j+YVD2/lggr5IrRoMfNqDYwHc8=" - }, - "ms": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", - "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=" - }, - "negotiator": { - "version": "0.6.1", - "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.1.tgz", - "integrity": "sha1-KzJxhOiZIQEXeyhWP7XnECrNDKk=" - }, - "on-finished": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.3.0.tgz", - "integrity": "sha1-IPEzZIGwg811M3mSoWlxqi2QaUc=", - "requires": { - "ee-first": "1.1.1" - } - }, - "optimist": { - "version": "0.6.1", - "resolved": "https://registry.npmjs.org/optimist/-/optimist-0.6.1.tgz", - "integrity": "sha1-2j6nRob6IaGaERwybpDrFaAZZoY=", - "requires": { - "minimist": "~0.0.1", - "wordwrap": "~0.0.2" - } - }, - "parseurl": { - "version": "1.3.2", - "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.2.tgz", - "integrity": "sha1-/CidTtiZMRlGDBViUyYs3I3mW/M=" - }, - "path-to-regexp": { - "version": "0.1.7", - "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.7.tgz", - "integrity": "sha1-32BBeABfUi8V60SQ5yR6G/qmf4w=" - }, - "proxy-addr": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.4.tgz", - "integrity": "sha512-5erio2h9jp5CHGwcybmxmVqHmnCBZeewlfJ0pex+UW7Qny7OOZXTtH56TGNyBizkgiOwhJtMKrVzDTeKcySZwA==", - "requires": { - "forwarded": "~0.1.2", - "ipaddr.js": "1.8.0" - } - }, - "qs": { - "version": "6.5.2", - "resolved": "https://registry.npmjs.org/qs/-/qs-6.5.2.tgz", - "integrity": "sha512-N5ZAX4/LxJmF+7wN74pUD6qAh9/wnvdQcjq9TZjevvXzSUo7bfmw91saqMjzGS2xq91/odN2dW/WOl7qQHNDGA==" - }, - "range-parser": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.0.tgz", - "integrity": "sha1-9JvmtIeJTdxA3MlKMi9hEJLgDV4=" - }, - "raw-body": { - "version": "2.3.3", - "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.3.3.tgz", - "integrity": "sha512-9esiElv1BrZoI3rCDuOuKCBRbuApGGaDPQfjSflGxdy4oyzqghxu6klEkkVIvBje+FF0BX9coEv8KqW6X/7njw==", - "requires": { - "bytes": "3.0.0", - "http-errors": "1.6.3", - "iconv-lite": "0.4.23", - "unpipe": "1.0.0" - } - }, - "redis": { - "version": "2.8.0", - "resolved": "https://registry.npmjs.org/redis/-/redis-2.8.0.tgz", - "integrity": "sha512-M1OkonEQwtRmZv4tEWF2VgpG0JWJ8Fv1PhlgT5+B+uNq2cA3Rt1Yt/ryoR+vQNOQcIEgdCdfH0jr3bDpihAw1A==", - "requires": { - "double-ended-queue": "^2.1.0-0", - "redis-commands": "^1.2.0", - "redis-parser": "^2.6.0" - }, - "dependencies": { - "redis-commands": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/redis-commands/-/redis-commands-1.4.0.tgz", - "integrity": "sha512-cu8EF+MtkwI4DLIT0x9P8qNTLFhQD4jLfxLR0cCNkeGzs87FN6879JOJwNQR/1zD7aSYNbU0hgsV9zGY71Itvw==" - }, - "redis-parser": { - "version": "2.6.0", - "resolved": "https://registry.npmjs.org/redis-parser/-/redis-parser-2.6.0.tgz", - "integrity": "sha1-Uu0J2srBCPGmMcB+m2mUHnoZUEs=" - } - } - }, - "redis-commands": { - "version": "1.5.0", - "resolved": "https://registry.npmjs.org/redis-commands/-/redis-commands-1.5.0.tgz", - "integrity": "sha512-6KxamqpZ468MeQC3bkWmCB1fp56XL64D4Kf0zJSwDZbVLLm7KFkoIcHrgRvQ+sk8dnhySs7+yBg94yIkAK7aJg==" - }, - "redis-parser": { - "version": "2.6.0", - "resolved": "https://registry.npmjs.org/redis-parser/-/redis-parser-2.6.0.tgz", - "integrity": "sha1-Uu0J2srBCPGmMcB+m2mUHnoZUEs=" - }, - "safe-buffer": { - "version": "5.1.2", - "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", - "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==" - }, - "safer-buffer": { - "version": "2.1.2", - "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", - "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==" - }, - "secure-random-string": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/secure-random-string/-/secure-random-string-1.1.0.tgz", - "integrity": "sha512-V/h8jqoz58zklNGybVhP++cWrxEPXlLM/6BeJ4e0a8zlb4BsbYRzFs16snrxByPa5LUxCVTD3M6EYIVIHR1fAg==" - }, - "send": { - "version": "0.16.2", - "resolved": "https://registry.npmjs.org/send/-/send-0.16.2.tgz", - "integrity": "sha512-E64YFPUssFHEFBvpbbjr44NCLtI1AohxQ8ZSiJjQLskAdKuriYEP6VyGEsRDH8ScozGpkaX1BGvhanqCwkcEZw==", - "requires": { - "debug": "2.6.9", - "depd": "~1.1.2", - "destroy": "~1.0.4", - "encodeurl": "~1.0.2", - "escape-html": "~1.0.3", - "etag": "~1.8.1", - "fresh": "0.5.2", - "http-errors": "~1.6.2", - "mime": "1.4.1", - "ms": "2.0.0", - "on-finished": "~2.3.0", - "range-parser": "~1.2.0", - "statuses": "~1.4.0" - } - }, - "serve-static": { - "version": "1.13.2", - "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.13.2.tgz", - "integrity": "sha512-p/tdJrO4U387R9oMjb1oj7qSMaMfmOyd4j9hOFoxZe2baQszgHcSWjuya/CiT5kgZZKRudHNOA0pYXOl8rQ5nw==", - "requires": { - "encodeurl": "~1.0.2", - "escape-html": "~1.0.3", - "parseurl": "~1.3.2", - "send": "0.16.2" - } - }, - "setprototypeof": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.1.0.tgz", - "integrity": "sha512-BvE/TwpZX4FXExxOxZyRGQQv651MSwmWKZGqvmPcRIjDqWub67kTKuIMx43cZZrS/cBBzwBcNDWoFxt2XEFIpQ==" - }, - "source-map": { - "version": "0.6.1", - "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", - "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==" - }, - "statuses": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/statuses/-/statuses-1.4.0.tgz", - "integrity": "sha512-zhSCtt8v2NDrRlPQpCNtw/heZLtfUDqxBM1udqikb/Hbk52LK4nQSwr10u77iopCW5LsyHpuXS0GnEc48mLeew==" - }, - "type-is": { - "version": "1.6.16", - "resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.16.tgz", - "integrity": "sha512-HRkVv/5qY2G6I8iab9cI7v1bOIdhm94dVjQCPFElW9W+3GeDOSHmy2EBYe4VTApuzolPcmgFTN3ftVJRKR2J9Q==", - "requires": { - "media-typer": "0.3.0", - "mime-types": "~2.1.18" - } - }, - "uglify-js": { - "version": "3.5.9", - "resolved": "https://registry.npmjs.org/uglify-js/-/uglify-js-3.5.9.tgz", - "integrity": "sha512-WpT0RqsDtAWPNJK955DEnb6xjymR8Fn0OlK4TT4pS0ASYsVPqr5ELhgwOwLCP5J5vHeJ4xmMmz3DEgdqC10JeQ==", - "optional": true, - "requires": { - "commander": "~2.20.0", - "source-map": "~0.6.1" - } - }, - "unpipe": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz", - "integrity": "sha1-sr9O6FFKrmFltIF4KdIbLvSZBOw=" - }, - "utils-merge": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz", - "integrity": "sha1-n5VxD1CiZ5R7LMwSR0HBAoQn5xM=" - }, - "vary": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz", - "integrity": "sha1-IpnwLG3tMNSllhsLn3RSShj2NPw=" - }, - "walk": { - "version": "2.3.9", - "resolved": "https://registry.npmjs.org/walk/-/walk-2.3.9.tgz", - "integrity": "sha1-MbTbZnjyrgHDnqn7hyWpAx5Vins=", - "requires": { - "foreachasync": "^3.0.0" - } - }, - "wordwrap": { - "version": "0.0.3", - "resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-0.0.3.tgz", - "integrity": "sha1-o9XabNXAvAAI03I0u68b7WMFkQc=" - } - } -} diff --git a/benchmarks/workloads/node_template/package.json b/benchmarks/workloads/node_template/package.json deleted file mode 100644 index 7dcadd523..000000000 --- a/benchmarks/workloads/node_template/package.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "name": "nodedum", - "version": "1.0.0", - "description": "", - "main": "index.js", - "scripts": { - "test": "echo \"Error: no test specified\" && exit 1" - }, - "author": "", - "license": "ISC", - "dependencies": { - "express": "^4.16.4", - "hbs": "^4.0.4", - "redis": "^2.8.0", - "redis-commands": "^1.2.0", - "redis-parser": "^2.6.0", - "secure-random-string": "^1.1.0" - } -} diff --git a/benchmarks/workloads/redis/BUILD b/benchmarks/workloads/redis/BUILD deleted file mode 100644 index a70873065..000000000 --- a/benchmarks/workloads/redis/BUILD +++ /dev/null @@ -1,13 +0,0 @@ -load("//tools:defs.bzl", "pkg_tar") - -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -pkg_tar( - name = "tar", - srcs = [ - "Dockerfile", - ], -) diff --git a/benchmarks/workloads/redis/Dockerfile b/benchmarks/workloads/redis/Dockerfile deleted file mode 100644 index 0f17249af..000000000 --- a/benchmarks/workloads/redis/Dockerfile +++ /dev/null @@ -1 +0,0 @@ -FROM redis:5.0.4 diff --git a/benchmarks/workloads/redisbenchmark/BUILD b/benchmarks/workloads/redisbenchmark/BUILD deleted file mode 100644 index 147cfedd2..000000000 --- a/benchmarks/workloads/redisbenchmark/BUILD +++ /dev/null @@ -1,28 +0,0 @@ -load("//tools:defs.bzl", "pkg_tar", "py_library", "py_test") -load("//benchmarks:defs.bzl", "test_deps") - -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -py_library( - name = "redisbenchmark", - srcs = ["__init__.py"], -) - -py_test( - name = "redisbenchmark_test", - srcs = ["redisbenchmark_test.py"], - python_version = "PY3", - deps = test_deps + [ - ":redisbenchmark", - ], -) - -pkg_tar( - name = "tar", - srcs = [ - "Dockerfile", - ], -) diff --git a/benchmarks/workloads/redisbenchmark/Dockerfile b/benchmarks/workloads/redisbenchmark/Dockerfile deleted file mode 100644 index f94f6442e..000000000 --- a/benchmarks/workloads/redisbenchmark/Dockerfile +++ /dev/null @@ -1,4 +0,0 @@ -FROM redis:5.0.4 -ENV host localhost -ENV port 6379 -CMD ["sh", "-c", "redis-benchmark --csv -h ${host} -p ${port} ${flags}"] diff --git a/benchmarks/workloads/redisbenchmark/__init__.py b/benchmarks/workloads/redisbenchmark/__init__.py deleted file mode 100644 index 229cef5fa..000000000 --- a/benchmarks/workloads/redisbenchmark/__init__.py +++ /dev/null @@ -1,85 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Redis-benchmark tool.""" - -import re - -OPERATIONS = [ - "PING_INLINE", - "PING_BULK", - "SET", - "GET", - "INCR", - "LPUSH", - "RPUSH", - "LPOP", - "RPOP", - "SADD", - "HSET", - "SPOP", - "LRANGE_100", - "LRANGE_300", - "LRANGE_500", - "LRANGE_600", - "MSET", -] - -METRICS = dict() - -SAMPLE_DATA = """ -"PING_INLINE","48661.80" -"PING_BULK","50301.81" -"SET","48923.68" -"GET","49382.71" -"INCR","49975.02" -"LPUSH","49875.31" -"RPUSH","50276.52" -"LPOP","50327.12" -"RPOP","50556.12" -"SADD","49504.95" -"HSET","49504.95" -"SPOP","50025.02" -"LPUSH (needed to benchmark LRANGE)","48875.86" -"LRANGE_100 (first 100 elements)","33955.86" -"LRANGE_300 (first 300 elements)","16550.81" -"LRANGE_500 (first 450 elements)","13653.74" -"LRANGE_600 (first 600 elements)","11219.57" -"MSET (10 keys)","44682.75" -""" - - -# pylint: disable=unused-argument -def sample(**kwargs) -> str: - return SAMPLE_DATA - - -# Bind a metric for each operation noted above. -for op in OPERATIONS: - - def bind(metric): - """Bind op to a new scope.""" - - # pylint: disable=unused-argument - def parse(data: str, **kwargs) -> float: - """Operation throughput in requests/sec.""" - regex = r"\"" + metric + r"( .*)?\",\"(\d*.\d*)" - res = re.compile(regex).search(data) - if res: - return float(res.group(2)) - return 0.0 - - parse.__name__ = metric - return parse - - METRICS[op] = bind(op) diff --git a/benchmarks/workloads/redisbenchmark/redisbenchmark_test.py b/benchmarks/workloads/redisbenchmark/redisbenchmark_test.py deleted file mode 100644 index 419ced059..000000000 --- a/benchmarks/workloads/redisbenchmark/redisbenchmark_test.py +++ /dev/null @@ -1,51 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Parser test.""" - -import sys - -import pytest - -from benchmarks.workloads import redisbenchmark - -RESULTS = { - "PING_INLINE": 48661.80, - "PING_BULK": 50301.81, - "SET": 48923.68, - "GET": 49382.71, - "INCR": 49975.02, - "LPUSH": 49875.31, - "RPUSH": 50276.52, - "LPOP": 50327.12, - "RPOP": 50556.12, - "SADD": 49504.95, - "HSET": 49504.95, - "SPOP": 50025.02, - "LRANGE_100": 33955.86, - "LRANGE_300": 16550.81, - "LRANGE_500": 13653.74, - "LRANGE_600": 11219.57, - "MSET": 44682.75 -} - - -def test_metrics(): - """Test all metrics.""" - for (metric, func) in redisbenchmark.METRICS.items(): - res = func(redisbenchmark.sample()) - assert float(res) == RESULTS[metric] - - -if __name__ == "__main__": - sys.exit(pytest.main([__file__])) diff --git a/benchmarks/workloads/ruby/BUILD b/benchmarks/workloads/ruby/BUILD deleted file mode 100644 index a3be4fe92..000000000 --- a/benchmarks/workloads/ruby/BUILD +++ /dev/null @@ -1,28 +0,0 @@ -load("//tools:defs.bzl", "pkg_tar") - -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -filegroup( - name = "files", - srcs = [ - "Dockerfile", - "Gemfile", - "Gemfile.lock", - "config.ru", - "index.rb", - ], -) - -pkg_tar( - name = "tar", - srcs = [ - "Dockerfile", - "Gemfile", - "Gemfile.lock", - "config.ru", - "index.rb", - ], -) diff --git a/benchmarks/workloads/ruby/Dockerfile b/benchmarks/workloads/ruby/Dockerfile deleted file mode 100644 index a9a7a7086..000000000 --- a/benchmarks/workloads/ruby/Dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -# example based on https://github.com/errm/fib - -FROM ruby:2.5 - -RUN apt-get update -qq && apt-get install -y build-essential libpq-dev nodejs libsodium-dev - -# Set an environment variable where the Rails app is installed to inside of Docker image -ENV RAILS_ROOT /var/www/app_name -RUN mkdir -p $RAILS_ROOT - -# Set working directory -WORKDIR $RAILS_ROOT - -# Setting env up -ENV RAILS_ENV='production' -ENV RACK_ENV='production' - -# Adding gems -COPY Gemfile Gemfile -COPY Gemfile.lock Gemfile.lock -RUN bundle install --jobs 20 --retry 5 --without development test - -# Adding project files -COPY . . - -EXPOSE $PORT -STOPSIGNAL SIGINT -CMD ["bundle", "exec", "puma", "config.ru"] diff --git a/benchmarks/workloads/ruby/Gemfile b/benchmarks/workloads/ruby/Gemfile deleted file mode 100644 index 8f1bdad6e..000000000 --- a/benchmarks/workloads/ruby/Gemfile +++ /dev/null @@ -1,12 +0,0 @@ -source "https://rubygems.org" -# load a bunch of dependencies to take up memory -gem "sinatra" -gem "puma" -gem "redis" -gem 'rake' -gem 'squid', '~> 1.4' -gem 'cassandra-driver' -gem 'ruby-fann' -gem 'rbnacl' -gem 'bcrypt' -gem "activemerchant"
\ No newline at end of file diff --git a/benchmarks/workloads/ruby/Gemfile.lock b/benchmarks/workloads/ruby/Gemfile.lock deleted file mode 100644 index ea9f0ea85..000000000 --- a/benchmarks/workloads/ruby/Gemfile.lock +++ /dev/null @@ -1,71 +0,0 @@ -GEM - remote: https://rubygems.org/ - specs: - activemerchant (1.105.0) - activesupport (>= 4.2) - builder (>= 2.1.2, < 4.0.0) - i18n (>= 0.6.9) - nokogiri (~> 1.4) - activesupport (5.2.3) - concurrent-ruby (~> 1.0, >= 1.0.2) - i18n (>= 0.7, < 2) - minitest (~> 5.1) - tzinfo (~> 1.1) - bcrypt (3.1.13) - builder (3.2.4) - cassandra-driver (3.2.3) - ione (~> 1.2) - concurrent-ruby (1.1.5) - ffi (1.12.2) - i18n (1.6.0) - concurrent-ruby (~> 1.0) - ione (1.2.4) - mini_portile2 (2.4.0) - minitest (5.11.3) - mustermann (1.0.3) - nokogiri (1.10.8) - mini_portile2 (~> 2.4.0) - pdf-core (0.7.0) - prawn (2.2.2) - pdf-core (~> 0.7.0) - ttfunk (~> 1.5) - puma (3.12.4) - rack (2.2.2) - rack-protection (2.0.5) - rack - rake (12.3.3) - rbnacl (7.1.1) - ffi - redis (4.1.1) - ruby-fann (1.2.6) - sinatra (2.0.5) - mustermann (~> 1.0) - rack (~> 2.0) - rack-protection (= 2.0.5) - tilt (~> 2.0) - squid (1.4.1) - activesupport (>= 4.0) - prawn (~> 2.2) - thread_safe (0.3.6) - tilt (2.0.9) - ttfunk (1.5.1) - tzinfo (1.2.5) - thread_safe (~> 0.1) - -PLATFORMS - ruby - -DEPENDENCIES - activemerchant - bcrypt - cassandra-driver - puma - rake - rbnacl - redis - ruby-fann - sinatra - squid (~> 1.4) - -BUNDLED WITH - 1.17.1 diff --git a/benchmarks/workloads/ruby/config.ru b/benchmarks/workloads/ruby/config.ru deleted file mode 100755 index fbd5acc82..000000000 --- a/benchmarks/workloads/ruby/config.ru +++ /dev/null @@ -1,2 +0,0 @@ -require './index' -run Sinatra::Application
\ No newline at end of file diff --git a/benchmarks/workloads/ruby/index.rb b/benchmarks/workloads/ruby/index.rb deleted file mode 100755 index 5fa85af93..000000000 --- a/benchmarks/workloads/ruby/index.rb +++ /dev/null @@ -1,14 +0,0 @@ -require "sinatra" -require "puma" -require "redis" -require "rake" -require "squid" -require "cassandra" -require "ruby-fann" -require "rbnacl" -require "bcrypt" -require "activemerchant" - -get "/" do - "Hello World!" -end
\ No newline at end of file diff --git a/benchmarks/workloads/ruby_template/BUILD b/benchmarks/workloads/ruby_template/BUILD deleted file mode 100644 index 72ed9403d..000000000 --- a/benchmarks/workloads/ruby_template/BUILD +++ /dev/null @@ -1,18 +0,0 @@ -load("//tools:defs.bzl", "pkg_tar") - -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -pkg_tar( - name = "tar", - srcs = [ - "Dockerfile", - "Gemfile", - "Gemfile.lock", - "config.ru", - "index.erb", - "main.rb", - ], -) diff --git a/benchmarks/workloads/sleep/BUILD b/benchmarks/workloads/sleep/BUILD deleted file mode 100644 index a70873065..000000000 --- a/benchmarks/workloads/sleep/BUILD +++ /dev/null @@ -1,13 +0,0 @@ -load("//tools:defs.bzl", "pkg_tar") - -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -pkg_tar( - name = "tar", - srcs = [ - "Dockerfile", - ], -) diff --git a/benchmarks/workloads/sleep/Dockerfile b/benchmarks/workloads/sleep/Dockerfile deleted file mode 100644 index 24c72e07a..000000000 --- a/benchmarks/workloads/sleep/Dockerfile +++ /dev/null @@ -1,3 +0,0 @@ -FROM alpine:latest - -CMD ["sleep", "315360000"] diff --git a/benchmarks/workloads/sysbench/BUILD b/benchmarks/workloads/sysbench/BUILD deleted file mode 100644 index ab2556064..000000000 --- a/benchmarks/workloads/sysbench/BUILD +++ /dev/null @@ -1,28 +0,0 @@ -load("//tools:defs.bzl", "pkg_tar", "py_library", "py_test") -load("//benchmarks:defs.bzl", "test_deps") - -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -py_library( - name = "sysbench", - srcs = ["__init__.py"], -) - -py_test( - name = "sysbench_test", - srcs = ["sysbench_test.py"], - python_version = "PY3", - deps = test_deps + [ - ":sysbench", - ], -) - -pkg_tar( - name = "tar", - srcs = [ - "Dockerfile", - ], -) diff --git a/benchmarks/workloads/sysbench/Dockerfile b/benchmarks/workloads/sysbench/Dockerfile deleted file mode 100644 index 8225e0e14..000000000 --- a/benchmarks/workloads/sysbench/Dockerfile +++ /dev/null @@ -1,16 +0,0 @@ -FROM ubuntu:18.04 - -RUN set -x \ - && apt-get update \ - && apt-get install -y \ - sysbench \ - && rm -rf /var/lib/apt/lists/* - -# Parameterize the tests. -ENV test cpu -ENV threads 1 -ENV options "" - -# run sysbench once as a warm-up and take the second result -CMD ["sh", "-c", "sysbench --threads=8 --memory-total-size=5G memory run > /dev/null && \ -sysbench --threads=${threads} ${options} ${test} run"] diff --git a/benchmarks/workloads/sysbench/__init__.py b/benchmarks/workloads/sysbench/__init__.py deleted file mode 100644 index de357b4db..000000000 --- a/benchmarks/workloads/sysbench/__init__.py +++ /dev/null @@ -1,167 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Sysbench.""" - -import re - -STD_REGEX = r"events per second:\s*(\d*.?\d*)\n" -MEM_REGEX = r"Total\soperations:\s+\d*\s*\((\d*\.\d*)\sper\ssecond\)" -ALT_REGEX = r"execution time \(avg/stddev\):\s*(\d*.?\d*)/(\d*.?\d*)" -AVG_REGEX = r"avg:[^\n^\d]*(\d*\.?\d*)" - -SAMPLE_CPU_DATA = """ -sysbench 1.0.11 (using system LuaJIT 2.1.0-beta3) - -Running the test with following options: -Number of threads: 8 -Initializing random number generator from current time - - -Prime numbers limit: 10000 - -Initializing worker threads... - -Threads started! - -CPU speed: - events per second: 9093.38 - -General statistics: - total time: 10.0007s - total number of events: 90949 - -Latency (ms): - min: 0.64 - avg: 0.88 - max: 24.65 - 95th percentile: 1.55 - sum: 79936.91 - -Threads fairness: - events (avg/stddev): 11368.6250/831.38 - execution time (avg/stddev): 9.9921/0.01 -""" - -SAMPLE_MEMORY_DATA = """ -sysbench 1.0.11 (using system LuaJIT 2.1.0-beta3) - -Running the test with following options: -Number of threads: 8 -Initializing random number generator from current time - - -Running memory speed test with the following options: - block size: 1KiB - total size: 102400MiB - operation: write - scope: global - -Initializing worker threads... - -Threads started! - -Total operations: 47999046 (9597428.64 per second) - -46874.07 MiB transferred (9372.49 MiB/sec) - - -General statistics: - total time: 5.0001s - total number of events: 47999046 - -Latency (ms): - min: 0.00 - avg: 0.00 - max: 0.21 - 95th percentile: 0.00 - sum: 33165.91 - -Threads fairness: - events (avg/stddev): 5999880.7500/111242.52 - execution time (avg/stddev): 4.1457/0.09 -""" - -SAMPLE_MUTEX_DATA = """ -sysbench 1.0.11 (using system LuaJIT 2.1.0-beta3) - -Running the test with following options: -Number of threads: 8 -Initializing random number generator from current time - - -Initializing worker threads... - -Threads started! - - -General statistics: - total time: 3.7869s - total number of events: 8 - -Latency (ms): - min: 3688.56 - avg: 3754.03 - max: 3780.94 - 95th percentile: 3773.42 - sum: 30032.28 - -Threads fairness: - events (avg/stddev): 1.0000/0.00 - execution time (avg/stddev): 3.7540/0.03 -""" - - -# pylint: disable=unused-argument -def sample(test, **kwargs): - switch = { - "cpu": SAMPLE_CPU_DATA, - "memory": SAMPLE_MEMORY_DATA, - "mutex": SAMPLE_MUTEX_DATA, - "randwr": SAMPLE_CPU_DATA - } - return switch[test] - - -# pylint: disable=unused-argument -def cpu_events_per_second(data: str, **kwargs) -> float: - """Returns events per second.""" - return float(re.compile(STD_REGEX).search(data).group(1)) - - -# pylint: disable=unused-argument -def memory_ops_per_second(data: str, **kwargs) -> float: - """Returns memory operations per second.""" - return float(re.compile(MEM_REGEX).search(data).group(1)) - - -# pylint: disable=unused-argument -def mutex_time(data: str, count: int, locks: int, threads: int, - **kwargs) -> float: - """Returns normalized mutex time (lower is better).""" - value = float(re.compile(ALT_REGEX).search(data).group(1)) - contention = float(threads) / float(locks) - scale = contention * float(count) / 100000000.0 - return value / scale - - -# pylint: disable=unused-argument -def mutex_deviation(data: str, **kwargs) -> float: - """Returns deviation for threads.""" - return float(re.compile(ALT_REGEX).search(data).group(2)) - - -# pylint: disable=unused-argument -def mutex_latency(data: str, **kwargs) -> float: - """Returns average mutex latency.""" - return float(re.compile(AVG_REGEX).search(data).group(1)) diff --git a/benchmarks/workloads/sysbench/sysbench_test.py b/benchmarks/workloads/sysbench/sysbench_test.py deleted file mode 100644 index 3fb541fd2..000000000 --- a/benchmarks/workloads/sysbench/sysbench_test.py +++ /dev/null @@ -1,34 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Parser test.""" - -import sys - -import pytest - -from benchmarks.workloads import sysbench - - -def test_sysbench_parser(): - """Test the basic parser.""" - assert sysbench.cpu_events_per_second(sysbench.sample("cpu")) == 9093.38 - assert sysbench.memory_ops_per_second(sysbench.sample("memory")) == 9597428.64 - assert sysbench.mutex_time(sysbench.sample("mutex"), 1, 1, - 100000000.0) == 3.754 - assert sysbench.mutex_deviation(sysbench.sample("mutex")) == 0.03 - assert sysbench.mutex_latency(sysbench.sample("mutex")) == 3754.03 - - -if __name__ == "__main__": - sys.exit(pytest.main([__file__])) diff --git a/benchmarks/workloads/syscall/BUILD b/benchmarks/workloads/syscall/BUILD deleted file mode 100644 index f8c43bca1..000000000 --- a/benchmarks/workloads/syscall/BUILD +++ /dev/null @@ -1,29 +0,0 @@ -load("//tools:defs.bzl", "pkg_tar", "py_library", "py_test") -load("//benchmarks:defs.bzl", "test_deps") - -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -py_library( - name = "syscall", - srcs = ["__init__.py"], -) - -py_test( - name = "syscall_test", - srcs = ["syscall_test.py"], - python_version = "PY3", - deps = test_deps + [ - ":syscall", - ], -) - -pkg_tar( - name = "tar", - srcs = [ - "Dockerfile", - "syscall.c", - ], -) diff --git a/benchmarks/workloads/syscall/Dockerfile b/benchmarks/workloads/syscall/Dockerfile deleted file mode 100644 index a2088d953..000000000 --- a/benchmarks/workloads/syscall/Dockerfile +++ /dev/null @@ -1,6 +0,0 @@ -FROM gcc:latest -COPY . /usr/src/syscall -WORKDIR /usr/src/syscall -RUN gcc -O2 -o syscall syscall.c -ENV count 1000000 -CMD ["sh", "-c", "./syscall ${count}"] diff --git a/benchmarks/workloads/syscall/__init__.py b/benchmarks/workloads/syscall/__init__.py deleted file mode 100644 index dc9028faa..000000000 --- a/benchmarks/workloads/syscall/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Simple syscall test.""" - -import re - -SAMPLE_DATA = "Called getpid syscall 1000000 times: 1117 ms, 500 ns each." - - -# pylint: disable=unused-argument -def sample(**kwargs) -> str: - return SAMPLE_DATA - - -# pylint: disable=unused-argument -def syscall_time_ns(data: str, **kwargs) -> int: - """Returns average system call time.""" - return float(re.compile(r"(\d+)\sns each.").search(data).group(1)) diff --git a/benchmarks/workloads/syscall/syscall.c b/benchmarks/workloads/syscall/syscall.c deleted file mode 100644 index ded030397..000000000 --- a/benchmarks/workloads/syscall/syscall.c +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#define _GNU_SOURCE -#include <stdio.h> -#include <stdlib.h> -#include <sys/syscall.h> -#include <sys/types.h> -#include <time.h> -#include <unistd.h> - -// Short program that calls getpid() a number of times and outputs time -// diference from the MONOTONIC clock. -int main(int argc, char** argv) { - struct timespec start, stop; - long result; - char buf[80]; - - if (argc < 2) { - printf("Usage:./syscall NUM_TIMES_TO_CALL"); - return 1; - } - - if (clock_gettime(CLOCK_MONOTONIC, &start)) return 1; - - long loops = atoi(argv[1]); - for (long i = 0; i < loops; i++) { - syscall(SYS_gettimeofday, 0, 0); - } - - if (clock_gettime(CLOCK_MONOTONIC, &stop)) return 1; - - if ((stop.tv_nsec - start.tv_nsec) < 0) { - result = (stop.tv_sec - start.tv_sec - 1) * 1000; - result += (stop.tv_nsec - start.tv_nsec + 1000000000) / (1000 * 1000); - } else { - result = (stop.tv_sec - start.tv_sec) * 1000; - result += (stop.tv_nsec - start.tv_nsec) / (1000 * 1000); - } - - printf("Called getpid syscall %d times: %lu ms, %lu ns each.\n", loops, - result, result * 1000000 / loops); - - return 0; -} diff --git a/benchmarks/workloads/syscall/syscall_test.py b/benchmarks/workloads/syscall/syscall_test.py deleted file mode 100644 index 72f027de1..000000000 --- a/benchmarks/workloads/syscall/syscall_test.py +++ /dev/null @@ -1,27 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys - -import pytest - -from benchmarks.workloads import syscall - - -def test_syscall_time_ns(): - assert syscall.syscall_time_ns(syscall.sample()) == 500 - - -if __name__ == "__main__": - sys.exit(pytest.main([__file__])) diff --git a/benchmarks/workloads/tensorflow/BUILD b/benchmarks/workloads/tensorflow/BUILD deleted file mode 100644 index a7b7742f4..000000000 --- a/benchmarks/workloads/tensorflow/BUILD +++ /dev/null @@ -1,18 +0,0 @@ -load("//tools:defs.bzl", "pkg_tar") - -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -py_library( - name = "tensorflow", - srcs = ["__init__.py"], -) - -pkg_tar( - name = "tar", - srcs = [ - "Dockerfile", - ], -) diff --git a/benchmarks/workloads/tensorflow/Dockerfile b/benchmarks/workloads/tensorflow/Dockerfile deleted file mode 100644 index eefe6b3eb..000000000 --- a/benchmarks/workloads/tensorflow/Dockerfile +++ /dev/null @@ -1,14 +0,0 @@ -FROM tensorflow/tensorflow:1.13.2 - -RUN apt-get update \ - && apt-get install -y git -RUN git clone --depth 1 https://github.com/aymericdamien/TensorFlow-Examples.git -RUN python -m pip install --no-cache-dir -U pip setuptools -RUN python -m pip install --no-cache-dir matplotlib - -WORKDIR /TensorFlow-Examples/examples - -ENV PYTHONPATH="$PYTHONPATH:/TensorFlow-Examples/examples" - -ENV workload "3_NeuralNetworks/convolutional_network.py" -CMD python ${workload} diff --git a/benchmarks/workloads/tensorflow/__init__.py b/benchmarks/workloads/tensorflow/__init__.py deleted file mode 100644 index b5ec213f8..000000000 --- a/benchmarks/workloads/tensorflow/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# python3 -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""A Tensorflow example.""" - - -# pylint: disable=unused-argument -def run_time(value, **kwargs): - """Returns the startup and runtime of the Tensorflow workload in seconds.""" - return value diff --git a/benchmarks/workloads/true/BUILD b/benchmarks/workloads/true/BUILD deleted file mode 100644 index eba23d325..000000000 --- a/benchmarks/workloads/true/BUILD +++ /dev/null @@ -1,14 +0,0 @@ -load("//tools:defs.bzl", "pkg_tar") - -package( - default_visibility = ["//benchmarks:__subpackages__"], - licenses = ["notice"], -) - -pkg_tar( - name = "tar", - srcs = [ - "Dockerfile", - ], - extension = "tar", -) diff --git a/g3doc/BUILD b/g3doc/BUILD index c315d38be..f91a77b6f 100644 --- a/g3doc/BUILD +++ b/g3doc/BUILD @@ -31,7 +31,7 @@ doc( category = "Project", permalink = "/community/", subcategory = "Community", - weight = "95", + weight = "10", ) doc( @@ -40,5 +40,5 @@ doc( category = "Project", permalink = "/community/style_guide/", subcategory = "Community", - weight = "10", + weight = "99", ) diff --git a/g3doc/user_guide/BUILD b/g3doc/user_guide/BUILD index 355dd49b3..b69aee12c 100644 --- a/g3doc/user_guide/BUILD +++ b/g3doc/user_guide/BUILD @@ -68,12 +68,3 @@ doc( permalink = "/docs/user_guide/platforms/", weight = "30", ) - -doc( - name = "runtimeclass", - src = "runtimeclass.md", - category = "User Guide", - permalink = "/docs/user_guide/runtimeclass/", - subcategory = "Advanced", - weight = "91", -) diff --git a/g3doc/user_guide/containerd/BUILD b/g3doc/user_guide/containerd/BUILD new file mode 100644 index 000000000..979d46105 --- /dev/null +++ b/g3doc/user_guide/containerd/BUILD @@ -0,0 +1,33 @@ +load("//website:defs.bzl", "doc") + +package( + default_visibility = ["//website:__pkg__"], + licenses = ["notice"], +) + +doc( + name = "quick_start", + src = "quick_start.md", + category = "User Guide", + permalink = "/docs/user_guide/containerd/quick_start/", + subcategory = "Containerd", + weight = "10", +) + +doc( + name = "configuration", + src = "configuration.md", + category = "User Guide", + permalink = "/docs/user_guide/containerd/configuration/", + subcategory = "Containerd", + weight = "90", +) + +doc( + name = "containerd_11", + src = "containerd_11.md", + category = "User Guide", + permalink = "/docs/user_guide/containerd/containerd_11/", + subcategory = "Containerd", + weight = "99", +) diff --git a/g3doc/user_guide/containerd/configuration.md b/g3doc/user_guide/containerd/configuration.md new file mode 100644 index 000000000..5d485c24b --- /dev/null +++ b/g3doc/user_guide/containerd/configuration.md @@ -0,0 +1,70 @@ +# Containerd Advanced Configuration + +This document describes how to configure runtime options for +`containerd-shim-runsc-v1`. This follows the +[Containerd Quick Start](./quick_start.md) and requires containerd 1.2 or later. + +### Update `/etc/containerd/config.toml` to point to a configuration file for `containerd-shim-runsc-v1`. + +`containerd-shim-runsc-v1` supports a few different configuration options based +on the version of containerd that is used. For versions >= 1.3, it supports a +configurable `ConfigPath` in the containerd runtime configuration. + +```shell +cat <<EOF | sudo tee /etc/containerd/config.toml +disabled_plugins = ["restart"] +[plugins.linux] + shim_debug = true +[plugins.cri.containerd.runtimes.runsc] + runtime_type = "io.containerd.runsc.v1" +[plugins.cri.containerd.runtimes.runsc.options] + TypeUrl = "io.containerd.runsc.v1.options" + # containerd 1.3 only! + ConfigPath = "/etc/containerd/runsc.toml" +EOF +``` + +When you are done restart containerd to pick up the new configuration files. + +```shell +sudo systemctl restart containerd +``` + +### Configure `/etc/containerd/runsc.toml` + +> Note: For containerd 1.2, the config file should named `config.toml` and +> located in the runtime root. By default, this is `/run/containerd/runsc`. + +The set of options that can be configured can be found in +[options.go](https://github.com/google/gvisor/blob/master/pkg/shim/v2/options/options.go). + +#### Example: Enable the KVM platform + +gVisor enables the use of a number of platforms. This example shows how to +configure `containerd-shim-runsc-v1` to use gvisor with the KVM platform. + +Find out more about platform in the +[Platforms Guide](../../architecture_guide/platforms.md). + +```shell +cat <<EOF | sudo tee /etc/containerd/runsc.toml +[runsc_config] +platform = "kvm" +EOF +``` + +### Example: Enable gVisor debug logging + +gVisor debug logging can be enabled by setting the `debug` and `debug-log` flag. +The shim will replace "%ID%" with the container ID, and "%COMMAND%" with the +runsc command (run, boot, etc.) in the path of the `debug-log` flag. + +Find out more about debugging in the [debugging guide](../debugging.md). + +```shell +cat <<EOF | sudo tee /etc/containerd/runsc.toml +[runsc_config] + debug=true + debug-log=/var/log/%ID%/gvisor.%COMMAND%.log +EOF +``` diff --git a/g3doc/user_guide/containerd/containerd_11.md b/g3doc/user_guide/containerd/containerd_11.md new file mode 100644 index 000000000..50befbdf4 --- /dev/null +++ b/g3doc/user_guide/containerd/containerd_11.md @@ -0,0 +1,163 @@ +# Older Versions (containerd 1.1) + +This document describes how to install and run the `gvisor-containerd-shim` +using the untrusted workload CRI extension. This requires `containerd` 1.1 or +later. + +*Note: The untrusted workload CRI extension is deprecated by containerd and +`gvisor-containerd-shim` is maintained on a best-effort basis. If you are using +containerd 1.2+, please see the +[containerd 1.2+ documentation](./quick_start.md) and use +`containerd-shim-runsc-v1`.* + +## Requirements + +- **runsc** and **gvisor-containerd-shim**: See the + [installation guide](/docs/user_guide/install/). +- **containerd**: See the [containerd website](https://containerd.io/) for + information on how to install containerd. + +## Configure containerd + +Create the configuration for the gvisor shim in +`/etc/containerd/gvisor-containerd-shim.toml`: + +```shell +cat <<EOF | sudo tee /etc/containerd/gvisor-containerd-shim.toml +# This is the path to the default runc containerd-shim. +runc_shim = "/usr/local/bin/containerd-shim" +EOF +``` + +Update `/etc/containerd/config.toml`. Be sure to update the path to +`gvisor-containerd-shim` and `runsc` if necessary: + +```shell +cat <<EOF | sudo tee /etc/containerd/config.toml +disabled_plugins = ["restart"] +[plugins.linux] + shim = "/usr/local/bin/gvisor-containerd-shim" + shim_debug = true +[plugins.cri.containerd.untrusted_workload_runtime] + runtime_type = "io.containerd.runtime.v1.linux" + runtime_engine = "/usr/local/bin/runsc" + runtime_root = "/run/containerd/runsc" +EOF +``` + +Restart `containerd`: + +```shell +sudo systemctl restart containerd +``` + +## Usage + +You can run containers in gVisor via containerd's CRI. + +### Install crictl + +Download and install the `crictl` binary: + +```shell +{ +wget https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.13.0/crictl-v1.13.0-linux-amd64.tar.gz +tar xf crictl-v1.13.0-linux-amd64.tar.gz +sudo mv crictl /usr/local/bin +} +``` + +Write the `crictl` configuration file: + +```shell +cat <<EOF | sudo tee /etc/crictl.yaml +runtime-endpoint: unix:///run/containerd/containerd.sock +EOF +``` + +### Create the nginx Sandbox in gVisor + +Pull the nginx image: + +```shell +sudo crictl pull nginx +``` + +Create the sandbox creation request: + +```shell +cat <<EOF | tee sandbox.json +{ + "metadata": { + "name": "nginx-sandbox", + "namespace": "default", + "attempt": 1, + "uid": "hdishd83djaidwnduwk28bcsb" + }, + "annotations": { + "io.kubernetes.cri.untrusted-workload": "true" + }, + "linux": { + }, + "log_directory": "/tmp" +} +EOF +``` + +Create the pod in gVisor: + +```shell +SANDBOX_ID=$(sudo crictl runp sandbox.json) +``` + +### Run the nginx Container in the Sandbox + +Create the nginx container creation request: + +```shell +cat <<EOF | tee container.json +{ + "metadata": { + "name": "nginx" + }, + "image":{ + "image": "nginx" + }, + "log_path":"nginx.0.log", + "linux": { + } +} +EOF +``` + +Create the nginx container: + +```shell +CONTAINER_ID=$(sudo crictl create ${SANDBOX_ID} container.json sandbox.json) +``` + +Start the nginx container: + +```shell +sudo crictl start ${CONTAINER_ID} +``` + +### Validate the container + +Inspect the created pod: + +```shell +sudo crictl inspectp ${SANDBOX_ID} +``` + +Inspect the nginx container: + +```shell +sudo crictl inspect ${CONTAINER_ID} +``` + +Verify that nginx is running in gVisor: + +```shell +sudo crictl exec ${CONTAINER_ID} dmesg | grep -i gvisor +``` diff --git a/g3doc/user_guide/containerd/quick_start.md b/g3doc/user_guide/containerd/quick_start.md new file mode 100644 index 000000000..2f67eecb3 --- /dev/null +++ b/g3doc/user_guide/containerd/quick_start.md @@ -0,0 +1,176 @@ +# Containerd Quick Start + +This document describes how to install and configure `containerd-shim-runsc-v1` +using the containerd runtime handler support on `containerd` 1.2 or later. + +## Requirements + +- **runsc** and **containerd-shim-runsc-v1**: See the + [installation guide](/docs/user_guide/install/). +- **containerd**: See the [containerd website](https://containerd.io/) for + information on how to install containerd. + +## Configure containerd + +Update `/etc/containerd/config.toml`. Make sure `containerd-shim-runsc-v1` is in +`${PATH}` or in the same directory as `containerd` binary. + +```shell +cat <<EOF | sudo tee /etc/containerd/config.toml +disabled_plugins = ["restart"] +[plugins.linux] + shim_debug = true +[plugins.cri.containerd.runtimes.runsc] + runtime_type = "io.containerd.runsc.v1" +EOF +``` + +Restart `containerd`: + +```shell +sudo systemctl restart containerd +``` + +## Usage + +You can run containers in gVisor via containerd's CRI. + +### Install crictl + +Download and install the `crictl`` binary: + +```shell +{ +wget https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.13.0/crictl-v1.13.0-linux-amd64.tar.gz +tar xf crictl-v1.13.0-linux-amd64.tar.gz +sudo mv crictl /usr/local/bin +} +``` + +Write the `crictl` configuration file: + +```shell +cat <<EOF | sudo tee /etc/crictl.yaml +runtime-endpoint: unix:///run/containerd/containerd.sock +EOF +``` + +### Create the nginx sandbox in gVisor + +Pull the nginx image: + +```shell +sudo crictl pull nginx +``` + +Create the sandbox creation request: + +```shell +cat <<EOF | tee sandbox.json +{ + "metadata": { + "name": "nginx-sandbox", + "namespace": "default", + "attempt": 1, + "uid": "hdishd83djaidwnduwk28bcsb" + }, + "linux": { + }, + "log_directory": "/tmp" +} +EOF +``` + +Create the pod in gVisor: + +```shell +SANDBOX_ID=$(sudo crictl runp --runtime runsc sandbox.json) +``` + +### Run the nginx container in the sandbox + +Create the nginx container creation request: + +```shell +cat <<EOF | tee container.json +{ + "metadata": { + "name": "nginx" + }, + "image":{ + "image": "nginx" + }, + "log_path":"nginx.0.log", + "linux": { + } +} +EOF +``` + +Create the nginx container: + +```shell +CONTAINER_ID=$(sudo crictl create ${SANDBOX_ID} container.json sandbox.json) +``` + +Start the nginx container: + +```shell +sudo crictl start ${CONTAINER_ID} +``` + +### Validate the container + +Inspect the created pod: + +```shell +sudo crictl inspectp ${SANDBOX_ID} +``` + +Inspect the nginx container: + +```shell +sudo crictl inspect ${CONTAINER_ID} +``` + +Verify that nginx is running in gVisor: + +```shell +sudo crictl exec ${CONTAINER_ID} dmesg | grep -i gvisor +``` + +### Set up the Kubernetes RuntimeClass + +Install the RuntimeClass for gVisor: + +```shell +cat <<EOF | kubectl apply -f - +apiVersion: node.k8s.io/v1beta1 +kind: RuntimeClass +metadata: + name: gvisor +handler: runsc +EOF +``` + +Create a Pod with the gVisor RuntimeClass: + +```shell +cat <<EOF | kubectl apply -f - +apiVersion: v1 +kind: Pod +metadata: + name: nginx-gvisor +spec: + runtimeClassName: gvisor + containers: + - name: nginx + image: nginx +EOF +``` + +Verify that the Pod is running: + +```shell +kubectl get pod nginx-gvisor -o wide +``` diff --git a/g3doc/user_guide/quick_start/kubernetes.md b/g3doc/user_guide/quick_start/kubernetes.md index f875d8002..395cd4b71 100644 --- a/g3doc/user_guide/quick_start/kubernetes.md +++ b/g3doc/user_guide/quick_start/kubernetes.md @@ -6,17 +6,15 @@ with Kubernetes. ## Using Minikube gVisor can run sandboxed containers in a Kubernetes cluster with Minikube. After -the gVisor addon is enabled, pods with `io.kubernetes.cri.untrusted-workload` +the gVisor addon is enabled, pods with a `gvisor` [Runtime Class][runtimeclass] set to true will execute with `runsc`. Follow [these instructions][minikube] to enable gVisor addon. ## Using Containerd -You can also setup Kubernetes nodes to run pods in gvisor using the -[containerd][containerd] CRI runtime and the `gvisor-containerd-shim`. You can -use either the `io.kubernetes.cri.untrusted-workload` annotation or -[RuntimeClass][runtimeclass] to run Pods with `runsc`. You can find instructions -[here][gvisor-containerd-shim]. +You can also setup Kubernetes nodes to run pods in gVisor using +[containerd][containerd] and the gVisor containerd shim. You can find +instructions in the [Containerd Quick Start][gvisor-containerd]. ## Using GKE Sandbox @@ -31,6 +29,6 @@ WordPress site. You can view the full documentation [here][gke-sandbox-docs]. [gke]: https://cloud.google.com/kubernetes-engine/ [gke-sandbox]: https://cloud.google.com/kubernetes-engine/sandbox/ [gke-sandbox-docs]: https://cloud.google.com/kubernetes-engine/docs/how-to/sandbox-pods -[gvisor-containerd-shim]: https://github.com/google/gvisor-containerd-shim +[gvisor-containerd]: /docs/user_guide/containerd/quick_start/ [runtimeclass]: https://kubernetes.io/docs/concepts/containers/runtime-class/ [wordpress-quick]: /docs/tutorials/kubernetes/ diff --git a/g3doc/user_guide/runtimeclass.md b/g3doc/user_guide/runtimeclass.md deleted file mode 100644 index 2e2d997be..000000000 --- a/g3doc/user_guide/runtimeclass.md +++ /dev/null @@ -1,46 +0,0 @@ -# RuntimeClass - -First, follow the appropriate installation instructions for your version of -containerd. - -* For 1.1 or lower, use `gvisor-containerd-shim`. -* For 1.2 or higher, use `containerd-shim-runsc-v1`. - -# Set up the Kubernetes RuntimeClass - -Creating the [RuntimeClass][runtimeclass] in Kubernetes is simple once the -runtime is available for containerd: - -```shell -cat <<EOF | kubectl apply -f - -apiVersion: node.k8s.io/v1beta1 -kind: RuntimeClass -metadata: - name: gvisor -handler: runsc -EOF -``` - -Pods can now be created using this RuntimeClass: - -```shell -cat <<EOF | kubectl apply -f - -apiVersion: v1 -kind: Pod -metadata: - name: nginx-gvisor -spec: - runtimeClassName: gvisor - containers: - - name: nginx - image: nginx -EOF -``` - -You can verify that the Pod is running via this RuntimeClass: - -```shell -kubectl get pod nginx-gvisor -o wide -``` - -[runtimeclass]: https://kubernetes.io/docs/concepts/containers/runtime-class/ diff --git a/g3doc/user_guide/tutorials/BUILD b/g3doc/user_guide/tutorials/BUILD index caae98623..405026a33 100644 --- a/g3doc/user_guide/tutorials/BUILD +++ b/g3doc/user_guide/tutorials/BUILD @@ -11,16 +11,7 @@ doc( category = "User Guide", permalink = "/docs/tutorials/docker/", subcategory = "Tutorials", - weight = "21", -) - -doc( - name = "cni", - src = "cni.md", - category = "User Guide", - permalink = "/docs/tutorials/cni/", - subcategory = "Tutorials", - weight = "22", + weight = "10", ) doc( @@ -33,5 +24,14 @@ doc( ], permalink = "/docs/tutorials/kubernetes/", subcategory = "Tutorials", - weight = "33", + weight = "20", +) + +doc( + name = "cni", + src = "cni.md", + category = "User Guide", + permalink = "/docs/tutorials/cni/", + subcategory = "Tutorials", + weight = "30", ) @@ -22,6 +22,7 @@ require ( github.com/docker/go-connections v0.3.0 // indirect github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c // indirect github.com/docker/go-units v0.4.0 // indirect + github.com/dpjacques/clockwork v0.1.1-0.20190114191937-d864eecc357b // indirect github.com/godbus/dbus v0.0.0-20190422162347-ade71ed3457e // indirect github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079 // indirect github.com/gogo/googleapis v1.4.0 // indirect @@ -43,7 +44,6 @@ require ( github.com/vishvananda/netns v0.0.0-20200520041808-52d707b772fe // indirect go.uber.org/atomic v1.6.0 // indirect go.uber.org/multierr v1.2.0 // indirect - golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527 // indirect golang.org/x/time v0.0.0-20191024005414-555d28b269f0 // indirect golang.org/x/tools v0.0.0-20200707200213-416e8f4faf8a // indirect google.golang.org/grpc v1.29.0 // indirect @@ -74,6 +74,8 @@ github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c h1:+pKlWGMw7gf6bQ github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c/go.mod h1:Uw6UezgYA44ePAFQYUehOuCzmy5zmg/+nl2ZfMWGkpA= github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw= github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/dpjacques/clockwork v0.1.1-0.20190114191937-d864eecc357b h1:7krODee+eIlZYoLiEDmP1kLFNCvd0bQ0eEXOympdN6U= +github.com/dpjacques/clockwork v0.1.1-0.20190114191937-d864eecc357b/go.mod h1:D8mP2A8vVT2GkXqPorSBmhnshhkFBYgzhA90KmJt25Y= github.com/dustin/go-humanize v0.0.0-20171111073723-bb3d318650d4/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= @@ -288,6 +290,7 @@ golang.org/x/sys v0.0.0-20191210023423-ac6580df4449/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20200113162924-86b910548bc1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200120151820-655fe14d7479/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200217220822-9197077df867/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200523222454-059865788121 h1:rITEj+UZHYC927n8GT97eC3zrpzXdb/voyeOuVKS46o= golang.org/x/sys v0.0.0-20200523222454-059865788121/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/images/Makefile b/images/Makefile index 9de359a28..278dec02f 100644 --- a/images/Makefile +++ b/images/Makefile @@ -59,9 +59,9 @@ local_image = $(LOCAL_IMAGE_PREFIX)/$(subst _,/,$(1)) # we need to explicitly repull the base layer in order to ensure that the # architecture is correct. Note that we use the term "rebuild" here to avoid # conflicting with the bazel "build" terminology, which is used elsewhere. +rebuild-%: FROM=$(shell grep FROM $(call path,$*)/Dockerfile } cut -d' ' -f2) rebuild-%: register-cross - FROM=$(shell grep FROM $(call path,$*)/Dockerfile | cut -d' ' -f2-) && \ - docker pull $(DOCKER_PLATFORM_ARGS) $$FROM + $(foreach IMAGE,$(FROM),docker $(DOCKER_PLATFORM_ARGS) $(IMAGE); &&) true T=$$(mktemp -d) && cp -a $(call path,$*)/* $$T && \ docker build $(DOCKER_PLATFORM_ARGS) -t $(call remote_image,$*) $$T && \ rm -rf $$T diff --git a/images/README.md b/images/README.md index 63ce46277..9880946a6 100644 --- a/images/README.md +++ b/images/README.md @@ -7,7 +7,7 @@ Note that all these images must be pushed to the testing project hosted on continuous integration. This will speed up loading as images will not need to be built from scratch for each test run. -Image tooling is accessible via `make`, specifically via `tools/images.mk`. +Image tooling is accessible via `make`, specifically via `images/Makefile`. ## Why make? diff --git a/images/basic/hostoverlaytest/Dockerfile b/images/basic/hostoverlaytest/Dockerfile index d83439e9c..6cef1a542 100644 --- a/images/basic/hostoverlaytest/Dockerfile +++ b/images/basic/hostoverlaytest/Dockerfile @@ -4,4 +4,5 @@ WORKDIR /root COPY . . RUN apt-get update && apt-get install -y gcc -RUN gcc -O2 -o test test.c +RUN gcc -O2 -o test_copy_up test_copy_up.c +RUN gcc -O2 -o test_rewinddir test_rewinddir.c diff --git a/images/basic/hostoverlaytest/testfile.txt b/images/basic/hostoverlaytest/copy_up_testfile.txt index e4188c841..e4188c841 100644 --- a/images/basic/hostoverlaytest/testfile.txt +++ b/images/basic/hostoverlaytest/copy_up_testfile.txt diff --git a/images/basic/hostoverlaytest/test.c b/images/basic/hostoverlaytest/test_copy_up.c index 088f90746..010b261dc 100644 --- a/images/basic/hostoverlaytest/test.c +++ b/images/basic/hostoverlaytest/test_copy_up.c @@ -6,7 +6,7 @@ #include <unistd.h> int main(int argc, char** argv) { - const char kTestFilePath[] = "testfile.txt"; + const char kTestFilePath[] = "copy_up_testfile.txt"; const char kOldFileData[] = "old data\n"; const char kNewFileData[] = "new data\n"; const size_t kPageSize = sysconf(_SC_PAGE_SIZE); diff --git a/images/basic/hostoverlaytest/test_rewinddir.c b/images/basic/hostoverlaytest/test_rewinddir.c new file mode 100644 index 000000000..f1a4085e1 --- /dev/null +++ b/images/basic/hostoverlaytest/test_rewinddir.c @@ -0,0 +1,78 @@ +#include <dirent.h> +#include <err.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/types.h> + +int main(int argc, char** argv) { + const char kDirPath[] = "rewinddir_test_dir"; + const char kFileBasename[] = "rewinddir_test_file"; + + // Create the test directory. + if (mkdir(kDirPath, 0755) < 0) { + err(1, "mkdir(%s)", kDirPath); + } + + // The test directory should initially be empty. + DIR* dir = opendir(kDirPath); + if (!dir) { + err(1, "opendir(%s)", kDirPath); + } + int failed = 0; + while (1) { + errno = 0; + struct dirent* d = readdir(dir); + if (!d) { + if (errno != 0) { + err(1, "readdir"); + } + break; + } + if (strcmp(d->d_name, ".") != 0 && strcmp(d->d_name, "..") != 0) { + warnx("unexpected file %s in new directory", d->d_name); + failed = 1; + } + } + + // Create a file in the test directory. + char* file_path = malloc(strlen(kDirPath) + 1 + strlen(kFileBasename)); + if (!file_path) { + errx(1, "malloc"); + } + strcpy(file_path, kDirPath); + file_path[strlen(kDirPath)] = '/'; + strcpy(file_path + strlen(kDirPath) + 1, kFileBasename); + if (mknod(file_path, 0644, 0) < 0) { + err(1, "mknod(%s)", file_path); + } + + // After rewinddir(), re-reading the directory stream should yield the new + // file. + rewinddir(dir); + size_t found_file = 0; + while (1) { + errno = 0; + struct dirent* d = readdir(dir); + if (!d) { + if (errno != 0) { + err(1, "readdir"); + } + break; + } + if (strcmp(d->d_name, kFileBasename) == 0) { + found_file++; + } else if (strcmp(d->d_name, ".") != 0 && strcmp(d->d_name, "..") != 0) { + warnx("unexpected file %s in new directory", d->d_name); + failed = 1; + } + } + if (found_file != 1) { + warnx("readdir returned file %s %zu times, wanted 1", kFileBasename, + found_file); + failed = 1; + } + + return failed; +} diff --git a/images/basic/linktest/Dockerfile b/images/basic/linktest/Dockerfile new file mode 100644 index 000000000..baebc9b76 --- /dev/null +++ b/images/basic/linktest/Dockerfile @@ -0,0 +1,7 @@ +FROM ubuntu:bionic + +WORKDIR /root +COPY . . + +RUN apt-get update && apt-get install -y gcc +RUN gcc -O2 -o link_test link_test.c diff --git a/images/basic/linktest/link_test.c b/images/basic/linktest/link_test.c new file mode 100644 index 000000000..45ab00abe --- /dev/null +++ b/images/basic/linktest/link_test.c @@ -0,0 +1,93 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <err.h> +#include <fcntl.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +// Basic test for linkat(2). Syscall tests requires CAP_DAC_READ_SEARCH and it +// cannot use tricks like userns as root. For this reason, run a basic link test +// to ensure some coverage. +int main(int argc, char** argv) { + const char kOldPath[] = "old.txt"; + int fd = open(kOldPath, O_RDWR | O_CREAT | O_TRUNC, 0600); + if (fd < 0) { + errx(1, "open(%s) failed", kOldPath); + } + const char kData[] = "some random content"; + if (write(fd, kData, sizeof(kData)) < 0) { + err(1, "write failed"); + } + close(fd); + + struct stat old_stat; + if (stat(kOldPath, &old_stat)) { + errx(1, "stat(%s) failed", kOldPath); + } + + const char kNewPath[] = "new.txt"; + if (link(kOldPath, kNewPath)) { + errx(1, "link(%s, %s) failed", kOldPath, kNewPath); + } + + struct stat new_stat; + if (stat(kNewPath, &new_stat)) { + errx(1, "stat(%s) failed", kNewPath); + } + + // Check that files are the same. + if (old_stat.st_dev != new_stat.st_dev) { + errx(1, "files st_dev is different, want: %lu, got: %lu", old_stat.st_dev, + new_stat.st_dev); + } + if (old_stat.st_ino != new_stat.st_ino) { + errx(1, "files st_ino is different, want: %lu, got: %lu", old_stat.st_ino, + new_stat.st_ino); + } + + // Check that link count is correct. + if (new_stat.st_nlink != old_stat.st_nlink + 1) { + errx(1, "wrong nlink, want: %lu, got: %lu", old_stat.st_nlink + 1, + new_stat.st_nlink); + } + + // Check taht contents are the same. + fd = open(kNewPath, O_RDONLY); + if (fd < 0) { + errx(1, "open(%s) failed", kNewPath); + } + char buf[sizeof(kData)] = {}; + if (read(fd, buf, sizeof(buf)) < 0) { + err(1, "read failed"); + } + close(fd); + + if (strcmp(buf, kData) != 0) { + errx(1, "file content mismatch: %s", buf); + } + + // Cleanup. + if (unlink(kNewPath)) { + errx(1, "unlink(%s) failed", kNewPath); + } + if (unlink(kOldPath)) { + errx(1, "unlink(%s) failed", kOldPath); + } + + // Success! + return 0; +} diff --git a/benchmarks/workloads/true/Dockerfile b/images/benchmarks/alpine/Dockerfile index 2e97c921e..b09b037ca 100644 --- a/benchmarks/workloads/true/Dockerfile +++ b/images/benchmarks/alpine/Dockerfile @@ -1,3 +1 @@ FROM alpine:latest - -CMD ["true"] diff --git a/benchmarks/workloads/nginx/Dockerfile b/images/benchmarks/nginx/Dockerfile index b64eb52ae..b64eb52ae 100644 --- a/benchmarks/workloads/nginx/Dockerfile +++ b/images/benchmarks/nginx/Dockerfile diff --git a/benchmarks/workloads/ruby_template/Dockerfile b/images/benchmarks/ruby/Dockerfile index a06d68bf4..13c4f6eed 100755 --- a/benchmarks/workloads/ruby_template/Dockerfile +++ b/images/benchmarks/ruby/Dockerfile @@ -1,5 +1,4 @@ # example based on https://github.com/errm/fib - FROM alpine:3.9 as build COPY Gemfile Gemfile.lock ./ @@ -23,16 +22,6 @@ RUN apk add --no-cache ruby ruby-json ruby-etc redis apache2-utils \ ).generate_bin \ end" -WORKDIR /app COPY . /app/. -ENV PORT=9292 \ - WEB_CONCURRENCY=20 \ - WEB_MAX_THREADS=20 \ - RACK_ENV=production - -ENV host localhost -EXPOSE $PORT -USER nobody STOPSIGNAL SIGINT -CMD ["sh", "-c", "/usr/bin/puma", "${host}"] diff --git a/benchmarks/workloads/ruby_template/Gemfile b/images/benchmarks/ruby/Gemfile index ac521b32c..ac521b32c 100755 --- a/benchmarks/workloads/ruby_template/Gemfile +++ b/images/benchmarks/ruby/Gemfile diff --git a/benchmarks/workloads/ruby_template/Gemfile.lock b/images/benchmarks/ruby/Gemfile.lock index eeb3c7bbe..041778e02 100644 --- a/benchmarks/workloads/ruby_template/Gemfile.lock +++ b/images/benchmarks/ruby/Gemfile.lock @@ -2,7 +2,7 @@ GEM remote: https://rubygems.org/ specs: mustermann (1.0.3) - puma (3.12.6) + puma (3.4.0) rack (2.0.6) rack-protection (2.0.5) rack diff --git a/benchmarks/workloads/ruby_template/config.ru b/images/benchmarks/ruby/config.ru index b2d135cc0..b2d135cc0 100755 --- a/benchmarks/workloads/ruby_template/config.ru +++ b/images/benchmarks/ruby/config.ru diff --git a/benchmarks/workloads/ruby_template/index.erb b/images/benchmarks/ruby/index.erb index 7f7300e80..7f7300e80 100755 --- a/benchmarks/workloads/ruby_template/index.erb +++ b/images/benchmarks/ruby/index.erb diff --git a/benchmarks/workloads/ruby_template/main.rb b/images/benchmarks/ruby/main.rb index 35c239377..b998f004e 100755 --- a/benchmarks/workloads/ruby_template/main.rb +++ b/images/benchmarks/ruby/main.rb @@ -2,7 +2,7 @@ require "sinatra" require "securerandom" require "redis" -redis_host = ENV["host"] +redis_host = ENV["HOST"] $redis = Redis.new(host: redis_host) def generateText @@ -24,4 +24,4 @@ get "/" do texts.push($redis.get(rand(0..99))) end template.result_with_hash(text: texts) -end
\ No newline at end of file +end diff --git a/images/benchmarks/runsc/Dockerfile b/images/benchmarks/runsc/Dockerfile new file mode 100644 index 000000000..6c3aafa57 --- /dev/null +++ b/images/benchmarks/runsc/Dockerfile @@ -0,0 +1,24 @@ +FROM ubuntu:18.04 + +RUN set -x \ + && apt-get update \ + && apt-get install -y \ + wget \ + git \ + pkg-config \ + zip \ + g++ \ + zlib1g-dev \ + unzip \ + python-minimal \ + python3 \ + python3-pip \ + && rm -rf /var/lib/apt/lists/* +RUN wget https://github.com/bazelbuild/bazel/releases/download/3.4.1/bazel-3.4.1-installer-linux-x86_64.sh +RUN chmod +x bazel-3.4.1-installer-linux-x86_64.sh +RUN ./bazel-3.4.1-installer-linux-x86_64.sh + +# Download release-20200601.0 +RUN mkdir gvisor && cd gvisor \ + && git init && git remote add origin https://github.com/google/gvisor.git \ + && git fetch --depth 1 origin a9b47390c821942d60784e308f681f213645049c && git checkout FETCH_HEAD diff --git a/images/benchmarks/sysbench/Dockerfile b/images/benchmarks/sysbench/Dockerfile new file mode 100644 index 000000000..55e865f43 --- /dev/null +++ b/images/benchmarks/sysbench/Dockerfile @@ -0,0 +1,7 @@ +FROM ubuntu:18.04 + +RUN set -x \ + && apt-get update \ + && apt-get install -y \ + sysbench \ + && rm -rf /var/lib/apt/lists/* diff --git a/images/benchmarks/util/Dockerfile b/images/benchmarks/util/Dockerfile new file mode 100644 index 000000000..f2799b3e6 --- /dev/null +++ b/images/benchmarks/util/Dockerfile @@ -0,0 +1,3 @@ +FROM ubuntu:bionic + +RUN apt-get update && apt-get install -y wget diff --git a/pkg/abi/linux/fuse.go b/pkg/abi/linux/fuse.go index d3ebbccc4..5c6ffe4a3 100644 --- a/pkg/abi/linux/fuse.go +++ b/pkg/abi/linux/fuse.go @@ -141,3 +141,108 @@ type FUSEWriteIn struct { _ uint32 } + +// FUSE_INIT flags, consistent with the ones in include/uapi/linux/fuse.h. +const ( + FUSE_ASYNC_READ = 1 << 0 + FUSE_POSIX_LOCKS = 1 << 1 + FUSE_FILE_OPS = 1 << 2 + FUSE_ATOMIC_O_TRUNC = 1 << 3 + FUSE_EXPORT_SUPPORT = 1 << 4 + FUSE_BIG_WRITES = 1 << 5 + FUSE_DONT_MASK = 1 << 6 + FUSE_SPLICE_WRITE = 1 << 7 + FUSE_SPLICE_MOVE = 1 << 8 + FUSE_SPLICE_READ = 1 << 9 + FUSE_FLOCK_LOCKS = 1 << 10 + FUSE_HAS_IOCTL_DIR = 1 << 11 + FUSE_AUTO_INVAL_DATA = 1 << 12 + FUSE_DO_READDIRPLUS = 1 << 13 + FUSE_READDIRPLUS_AUTO = 1 << 14 + FUSE_ASYNC_DIO = 1 << 15 + FUSE_WRITEBACK_CACHE = 1 << 16 + FUSE_NO_OPEN_SUPPORT = 1 << 17 + FUSE_PARALLEL_DIROPS = 1 << 18 + FUSE_HANDLE_KILLPRIV = 1 << 19 + FUSE_POSIX_ACL = 1 << 20 + FUSE_ABORT_ERROR = 1 << 21 + FUSE_MAX_PAGES = 1 << 22 + FUSE_CACHE_SYMLINKS = 1 << 23 + FUSE_NO_OPENDIR_SUPPORT = 1 << 24 + FUSE_EXPLICIT_INVAL_DATA = 1 << 25 + FUSE_MAP_ALIGNMENT = 1 << 26 +) + +// currently supported FUSE protocol version numbers. +const ( + FUSE_KERNEL_VERSION = 7 + FUSE_KERNEL_MINOR_VERSION = 31 +) + +// FUSEInitIn is the request sent by the kernel to the daemon, +// to negotiate the version and flags. +// +// +marshal +type FUSEInitIn struct { + // Major version supported by kernel. + Major uint32 + + // Minor version supported by the kernel. + Minor uint32 + + // MaxReadahead is the maximum number of bytes to read-ahead + // decided by the kernel. + MaxReadahead uint32 + + // Flags of this init request. + Flags uint32 +} + +// FUSEInitOut is the reply sent by the daemon to the kernel +// for FUSEInitIn. +// +// +marshal +type FUSEInitOut struct { + // Major version supported by daemon. + Major uint32 + + // Minor version supported by daemon. + Minor uint32 + + // MaxReadahead is the maximum number of bytes to read-ahead. + // Decided by the daemon, after receiving the value from kernel. + MaxReadahead uint32 + + // Flags of this init reply. + Flags uint32 + + // MaxBackground is the maximum number of pending background requests + // that the daemon wants. + MaxBackground uint16 + + // CongestionThreshold is the daemon-decided threshold for + // the number of the pending background requests. + CongestionThreshold uint16 + + // MaxWrite is the daemon's maximum size of a write buffer. + // Kernel adjusts it to the minimum (fuse/init.go:fuseMinMaxWrite). + // if the value from daemon is too small. + MaxWrite uint32 + + // TimeGran is the daemon's time granularity for mtime and ctime metadata. + // The unit is nanosecond. + // Value should be power of 10. + // 1 indicates full nanosecond granularity support. + TimeGran uint32 + + // MaxPages is the daemon's maximum number of pages for one write operation. + // Kernel adjusts it to the maximum (fuse/init.go:FUSE_MAX_MAX_PAGES). + // if the value from daemon is too large. + MaxPages uint16 + + // MapAlignment is an unknown field and not used by this package at this moment. + // Use as a placeholder to be consistent with the FUSE protocol. + MapAlignment uint16 + + _ [8]uint32 +} diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go index a91f9f018..9c27f7bb2 100644 --- a/pkg/abi/linux/netfilter.go +++ b/pkg/abi/linux/netfilter.go @@ -59,7 +59,7 @@ var VerdictStrings = map[int32]string{ NF_RETURN: "RETURN", } -// Socket options. These correspond to values in +// Socket options for SOL_SOCKET. These correspond to values in // include/uapi/linux/netfilter_ipv4/ip_tables.h. const ( IPT_BASE_CTL = 64 @@ -74,6 +74,12 @@ const ( IPT_SO_GET_MAX = IPT_SO_GET_REVISION_TARGET ) +// Socket option for SOL_IP. This corresponds to the value in +// include/uapi/linux/netfilter_ipv4.h. +const ( + SO_ORIGINAL_DST = 80 +) + // Name lengths. These correspond to values in // include/uapi/linux/netfilter/x_tables.h. const ( diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go index c24a8216e..693996c01 100644 --- a/pkg/abi/linux/socket.go +++ b/pkg/abi/linux/socket.go @@ -83,7 +83,6 @@ const ( MSG_MORE = 0x8000 MSG_WAITFORONE = 0x10000 MSG_SENDPAGE_NOTLAST = 0x20000 - MSG_REINJECT = 0x8000000 MSG_ZEROCOPY = 0x4000000 MSG_FASTOPEN = 0x20000000 MSG_CMSG_CLOEXEC = 0x40000000 @@ -239,11 +238,13 @@ const SockAddrMax = 128 type InetAddr [4]byte // SockAddrInet is struct sockaddr_in, from uapi/linux/in.h. +// +// +marshal type SockAddrInet struct { Family uint16 Port uint16 Addr InetAddr - Zero [8]uint8 // pad to sizeof(struct sockaddr). + _ [8]uint8 // pad to sizeof(struct sockaddr). } // InetMulticastRequest is struct ip_mreq, from uapi/linux/in.h. diff --git a/pkg/refs/BUILD b/pkg/refs/BUILD index 74affc887..9888cce9c 100644 --- a/pkg/refs/BUILD +++ b/pkg/refs/BUILD @@ -24,6 +24,7 @@ go_library( ], visibility = ["//:sandbox"], deps = [ + "//pkg/context", "//pkg/log", "//pkg/sync", ], @@ -34,5 +35,8 @@ go_test( size = "small", srcs = ["refcounter_test.go"], library = ":refs", - deps = ["//pkg/sync"], + deps = [ + "//pkg/context", + "//pkg/sync", + ], ) diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go index c45ba8200..3f39edb66 100644 --- a/pkg/refs/refcounter.go +++ b/pkg/refs/refcounter.go @@ -23,6 +23,7 @@ import ( "runtime" "sync/atomic" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" ) @@ -38,7 +39,7 @@ type RefCounter interface { // Note that AtomicRefCounter.DecRef() does not support destructors. // If a type has a destructor, it must implement its own DecRef() // method and call AtomicRefCounter.DecRefWithDestructor(destructor). - DecRef() + DecRef(ctx context.Context) // TryIncRef attempts to increase the reference counter on the object, // but may fail if all references have already been dropped. This @@ -57,7 +58,7 @@ type RefCounter interface { // A WeakRefUser is notified when the last non-weak reference is dropped. type WeakRefUser interface { // WeakRefGone is called when the last non-weak reference is dropped. - WeakRefGone() + WeakRefGone(ctx context.Context) } // WeakRef is a weak reference. @@ -123,7 +124,7 @@ func (w *WeakRef) Get() RefCounter { // Drop drops this weak reference. You should always call drop when you are // finished with the weak reference. You may not use this object after calling // drop. -func (w *WeakRef) Drop() { +func (w *WeakRef) Drop(ctx context.Context) { rc, ok := w.get() if !ok { // We've been zapped already. When the refcounter has called @@ -145,7 +146,7 @@ func (w *WeakRef) Drop() { // And now aren't on the object's list of weak references. So it won't // zap us if this causes the reference count to drop to zero. - rc.DecRef() + rc.DecRef(ctx) // Return to the pool. weakRefPool.Put(w) @@ -214,6 +215,8 @@ type AtomicRefCount struct { // LeakMode configures the leak checker. type LeakMode uint32 +// TODO(gvisor.dev/issue/1624): Simplify down to two modes once vfs1 ref +// counting is gone. const ( // UninitializedLeakChecking indicates that the leak checker has not yet been initialized. UninitializedLeakChecking LeakMode = iota @@ -243,6 +246,11 @@ func SetLeakMode(mode LeakMode) { atomic.StoreUint32(&leakMode, uint32(mode)) } +// GetLeakMode returns the current leak mode. +func GetLeakMode() LeakMode { + return LeakMode(atomic.LoadUint32(&leakMode)) +} + const maxStackFrames = 40 type fileLine struct { @@ -427,7 +435,7 @@ func (r *AtomicRefCount) dropWeakRef(w *WeakRef) { // A: TryIncRef [transform speculative to real] // //go:nosplit -func (r *AtomicRefCount) DecRefWithDestructor(destroy func()) { +func (r *AtomicRefCount) DecRefWithDestructor(ctx context.Context, destroy func(context.Context)) { switch v := atomic.AddInt64(&r.refCount, -1); { case v < -1: panic("Decrementing non-positive ref count") @@ -448,7 +456,7 @@ func (r *AtomicRefCount) DecRefWithDestructor(destroy func()) { if user != nil { r.mu.Unlock() - user.WeakRefGone() + user.WeakRefGone(ctx) r.mu.Lock() } } @@ -456,7 +464,7 @@ func (r *AtomicRefCount) DecRefWithDestructor(destroy func()) { // Call the destructor. if destroy != nil { - destroy() + destroy(ctx) } } } @@ -464,6 +472,6 @@ func (r *AtomicRefCount) DecRefWithDestructor(destroy func()) { // DecRef decrements this object's reference count. // //go:nosplit -func (r *AtomicRefCount) DecRef() { - r.DecRefWithDestructor(nil) +func (r *AtomicRefCount) DecRef(ctx context.Context) { + r.DecRefWithDestructor(ctx, nil) } diff --git a/pkg/refs/refcounter_test.go b/pkg/refs/refcounter_test.go index 1ab4a4440..6d0dd1018 100644 --- a/pkg/refs/refcounter_test.go +++ b/pkg/refs/refcounter_test.go @@ -18,6 +18,7 @@ import ( "reflect" "testing" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" ) @@ -31,11 +32,11 @@ type testCounter struct { destroyed bool } -func (t *testCounter) DecRef() { - t.AtomicRefCount.DecRefWithDestructor(t.destroy) +func (t *testCounter) DecRef(ctx context.Context) { + t.AtomicRefCount.DecRefWithDestructor(ctx, t.destroy) } -func (t *testCounter) destroy() { +func (t *testCounter) destroy(context.Context) { t.mu.Lock() defer t.mu.Unlock() t.destroyed = true @@ -53,7 +54,7 @@ func newTestCounter() *testCounter { func TestOneRef(t *testing.T) { tc := newTestCounter() - tc.DecRef() + tc.DecRef(context.Background()) if !tc.IsDestroyed() { t.Errorf("object should have been destroyed") @@ -63,8 +64,9 @@ func TestOneRef(t *testing.T) { func TestTwoRefs(t *testing.T) { tc := newTestCounter() tc.IncRef() - tc.DecRef() - tc.DecRef() + ctx := context.Background() + tc.DecRef(ctx) + tc.DecRef(ctx) if !tc.IsDestroyed() { t.Errorf("object should have been destroyed") @@ -74,12 +76,13 @@ func TestTwoRefs(t *testing.T) { func TestMultiRefs(t *testing.T) { tc := newTestCounter() tc.IncRef() - tc.DecRef() + ctx := context.Background() + tc.DecRef(ctx) tc.IncRef() - tc.DecRef() + tc.DecRef(ctx) - tc.DecRef() + tc.DecRef(ctx) if !tc.IsDestroyed() { t.Errorf("object should have been destroyed") @@ -89,19 +92,20 @@ func TestMultiRefs(t *testing.T) { func TestWeakRef(t *testing.T) { tc := newTestCounter() w := NewWeakRef(tc, nil) + ctx := context.Background() // Try resolving. if x := w.Get(); x == nil { t.Errorf("weak reference didn't resolve: expected %v, got nil", tc) } else { - x.DecRef() + x.DecRef(ctx) } // Try resolving again. if x := w.Get(); x == nil { t.Errorf("weak reference didn't resolve: expected %v, got nil", tc) } else { - x.DecRef() + x.DecRef(ctx) } // Shouldn't be destroyed yet. (Can't continue if this fails.) @@ -110,7 +114,7 @@ func TestWeakRef(t *testing.T) { } // Drop the original reference. - tc.DecRef() + tc.DecRef(ctx) // Assert destroyed. if !tc.IsDestroyed() { @@ -126,7 +130,8 @@ func TestWeakRef(t *testing.T) { func TestWeakRefDrop(t *testing.T) { tc := newTestCounter() w := NewWeakRef(tc, nil) - w.Drop() + ctx := context.Background() + w.Drop(ctx) // Just assert the list is empty. if !tc.weakRefs.Empty() { @@ -134,14 +139,14 @@ func TestWeakRefDrop(t *testing.T) { } // Drop the original reference. - tc.DecRef() + tc.DecRef(ctx) } type testWeakRefUser struct { weakRefGone func() } -func (u *testWeakRefUser) WeakRefGone() { +func (u *testWeakRefUser) WeakRefGone(ctx context.Context) { u.weakRefGone() } @@ -165,7 +170,8 @@ func TestCallback(t *testing.T) { }}) // Drop the original reference, this must trigger the callback. - tc.DecRef() + ctx := context.Background() + tc.DecRef(ctx) if !called { t.Fatalf("Callback not called") diff --git a/pkg/refs_vfs2/BUILD b/pkg/refs_vfs2/BUILD new file mode 100644 index 000000000..7f180c7bd --- /dev/null +++ b/pkg/refs_vfs2/BUILD @@ -0,0 +1,28 @@ +load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template") + +package(licenses = ["notice"]) + +go_template( + name = "refs_template", + srcs = [ + "refs_template.go", + ], + types = [ + "T", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/log", + "//pkg/refs", + ], +) + +go_library( + name = "refs", + srcs = [ + "refs.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = ["//pkg/context"], +) diff --git a/pkg/refs_vfs2/refs.go b/pkg/refs_vfs2/refs.go new file mode 100644 index 000000000..ee01b17b0 --- /dev/null +++ b/pkg/refs_vfs2/refs.go @@ -0,0 +1,36 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package refs defines an interface for a reference-counted object. +package refs + +import ( + "gvisor.dev/gvisor/pkg/context" +) + +// RefCounter is the interface to be implemented by objects that are reference +// counted. +type RefCounter interface { + // IncRef increments the reference counter on the object. + IncRef() + + // DecRef decrements the object's reference count. Users of refs_template.Refs + // may specify a destructor to be called once the reference count reaches zero. + DecRef(ctx context.Context) + + // TryIncRef attempts to increment the reference count, but may fail if all + // references have already been dropped, in which case it returns false. If + // true is returned, then a valid reference is now held on the object. + TryIncRef() bool +} diff --git a/pkg/refs_vfs2/refs_template.go b/pkg/refs_vfs2/refs_template.go new file mode 100644 index 000000000..3e5b458c7 --- /dev/null +++ b/pkg/refs_vfs2/refs_template.go @@ -0,0 +1,133 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package refs_template defines a template that can be used by reference counted +// objects. +package refs_template + +import ( + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// T is the type of the reference counted object. It is only used to customize +// debug output when leak checking. +type T interface{} + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var ownerType *T + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// +stateify savable +type Refs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *Refs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sAtomicRefCount %p owned by %T garbage collected with ref count of %d (want 0)", note, r, ownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *Refs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*Refs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *Refs) ReadRefs() int64 { + // Account for the internal -1 offset on refcounts. + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *Refs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic("Incrementing non-positive ref count") + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *Refs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + // This object has already been freed. + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + // Turn into a real reference. + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *Refs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic("Decrementing non-positive ref count") + + case v == -1: + // Call the destructor. + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go index 663e51989..2bf3c45e1 100644 --- a/pkg/sentry/control/pprof.go +++ b/pkg/sentry/control/pprof.go @@ -49,6 +49,9 @@ type ProfileOpts struct { // - dump out the stack trace of current go routines. // sentryctl -pid <pid> pprof-goroutine type Profile struct { + // Kernel is the kernel under profile. It's immutable. + Kernel *kernel.Kernel + // mu protects the fields below. mu sync.Mutex @@ -57,9 +60,6 @@ type Profile struct { // traceFile is the current execution trace output file. traceFile *fd.FD - - // Kernel is the kernel under profile. - Kernel *kernel.Kernel } // StartCPUProfile is an RPC stub which starts recording the CPU profile in a diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 1bae7cfaf..dfa936563 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -139,7 +139,6 @@ func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { // Import file descriptors. fdTable := proc.Kernel.NewFDTable() - defer fdTable.DecRef() creds := auth.NewUserCredentials( args.KUID, @@ -177,6 +176,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI initArgs.MountNamespaceVFS2.IncRef() } ctx := initArgs.NewContext(proc.Kernel) + defer fdTable.DecRef(ctx) if kernel.VFS2Enabled { // Get the full path to the filename from the PATH env variable. diff --git a/pkg/sentry/devices/memdev/full.go b/pkg/sentry/devices/memdev/full.go index af66fe4dc..511179e31 100644 --- a/pkg/sentry/devices/memdev/full.go +++ b/pkg/sentry/devices/memdev/full.go @@ -46,7 +46,7 @@ type fullFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *fullFD) Release() { +func (fd *fullFD) Release(context.Context) { // noop } diff --git a/pkg/sentry/devices/memdev/null.go b/pkg/sentry/devices/memdev/null.go index 92d3d71be..4918dbeeb 100644 --- a/pkg/sentry/devices/memdev/null.go +++ b/pkg/sentry/devices/memdev/null.go @@ -47,7 +47,7 @@ type nullFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *nullFD) Release() { +func (fd *nullFD) Release(context.Context) { // noop } diff --git a/pkg/sentry/devices/memdev/random.go b/pkg/sentry/devices/memdev/random.go index 6b81da5ef..5e7fe0280 100644 --- a/pkg/sentry/devices/memdev/random.go +++ b/pkg/sentry/devices/memdev/random.go @@ -56,7 +56,7 @@ type randomFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *randomFD) Release() { +func (fd *randomFD) Release(context.Context) { // noop } diff --git a/pkg/sentry/devices/memdev/zero.go b/pkg/sentry/devices/memdev/zero.go index c6f15054d..2e631a252 100644 --- a/pkg/sentry/devices/memdev/zero.go +++ b/pkg/sentry/devices/memdev/zero.go @@ -48,7 +48,7 @@ type zeroFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *zeroFD) Release() { +func (fd *zeroFD) Release(context.Context) { // noop } diff --git a/pkg/sentry/devices/ttydev/ttydev.go b/pkg/sentry/devices/ttydev/ttydev.go index fbb7fd92c..fd4b79c46 100644 --- a/pkg/sentry/devices/ttydev/ttydev.go +++ b/pkg/sentry/devices/ttydev/ttydev.go @@ -55,7 +55,7 @@ type ttyFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *ttyFD) Release() {} +func (fd *ttyFD) Release(context.Context) {} // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *ttyFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { diff --git a/pkg/sentry/devices/tundev/tundev.go b/pkg/sentry/devices/tundev/tundev.go index dfbd069af..a40625e19 100644 --- a/pkg/sentry/devices/tundev/tundev.go +++ b/pkg/sentry/devices/tundev/tundev.go @@ -108,8 +108,8 @@ func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArg } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *tunFD) Release() { - fd.device.Release() +func (fd *tunFD) Release(ctx context.Context) { + fd.device.Release(ctx) } // PRead implements vfs.FileDescriptionImpl.PRead. @@ -160,8 +160,8 @@ func (fd *tunFD) EventUnregister(e *waiter.Entry) { fd.device.EventUnregister(e) } -// isNetTunSupported returns whether /dev/net/tun device is supported for s. -func isNetTunSupported(s inet.Stack) bool { +// IsNetTunSupported returns whether /dev/net/tun device is supported for s. +func IsNetTunSupported(s inet.Stack) bool { _, ok := s.(*netstack.Stack) return ok } diff --git a/pkg/sentry/fdimport/fdimport.go b/pkg/sentry/fdimport/fdimport.go index a4199f9e9..1b7cb94c0 100644 --- a/pkg/sentry/fdimport/fdimport.go +++ b/pkg/sentry/fdimport/fdimport.go @@ -15,6 +15,8 @@ package fdimport import ( + "fmt" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" @@ -48,7 +50,7 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds [] if err != nil { return nil, err } - defer appFile.DecRef() + defer appFile.DecRef(ctx) // Remember this in the TTY file, as we will // use it for the other stdio FDs. @@ -67,7 +69,7 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds [] if err != nil { return nil, err } - defer appFile.DecRef() + defer appFile.DecRef(ctx) } // Add the file to the FD map. @@ -84,6 +86,9 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds [] func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdioFDs []int) (*hostvfs2.TTYFileDescription, error) { k := kernel.KernelFromContext(ctx) + if k == nil { + return nil, fmt.Errorf("cannot find kernel from context") + } var ttyFile *vfs.FileDescription for appFD, hostFD := range stdioFDs { @@ -97,7 +102,7 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi if err != nil { return nil, err } - defer appFile.DecRef() + defer appFile.DecRef(ctx) // Remember this in the TTY file, as we will use it for the other stdio // FDs. @@ -114,7 +119,7 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi if err != nil { return nil, err } - defer appFile.DecRef() + defer appFile.DecRef(ctx) } if err := fdTable.NewFDAtVFS2(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil { diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index ab1424c95..735452b07 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -201,7 +201,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { parentUpper := parent.Inode.overlay.upper root := RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } // Create the file in the upper filesystem and get an Inode for it. @@ -212,7 +212,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { log.Warningf("copy up failed to create file: %v", err) return syserror.EIO } - defer childFile.DecRef() + defer childFile.DecRef(ctx) childUpperInode = childFile.Dirent.Inode case Directory: @@ -226,7 +226,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { cleanupUpper(ctx, parentUpper, next.name, werr) return syserror.EIO } - defer childUpper.DecRef() + defer childUpper.DecRef(ctx) childUpperInode = childUpper.Inode case Symlink: @@ -246,7 +246,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { cleanupUpper(ctx, parentUpper, next.name, werr) return syserror.EIO } - defer childUpper.DecRef() + defer childUpper.DecRef(ctx) childUpperInode = childUpper.Inode default: @@ -352,14 +352,14 @@ func copyContentsLocked(ctx context.Context, upper *Inode, lower *Inode, size in if err != nil { return err } - defer upperFile.DecRef() + defer upperFile.DecRef(ctx) // Get a handle to the lower filesystem, which we will read from. lowerFile, err := overlayFile(ctx, lower, FileFlags{Read: true}) if err != nil { return err } - defer lowerFile.DecRef() + defer lowerFile.DecRef(ctx) // Use a buffer pool to minimize allocations. buf := copyUpBuffers.Get().([]byte) diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go index 91792d9fe..c7a11eec1 100644 --- a/pkg/sentry/fs/copy_up_test.go +++ b/pkg/sentry/fs/copy_up_test.go @@ -126,7 +126,7 @@ func makeOverlayTestFiles(t *testing.T) []*overlayTestFile { if err != nil { t.Fatalf("failed to create file %q: %v", name, err) } - defer f.DecRef() + defer f.DecRef(ctx) relname, _ := f.Dirent.FullName(lowerRoot) @@ -171,7 +171,7 @@ func makeOverlayTestFiles(t *testing.T) []*overlayTestFile { if err != nil { t.Fatalf("failed to find %q: %v", f.name, err) } - defer d.DecRef() + defer d.DecRef(ctx) f.File, err = d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true}) if err != nil { diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go index dc7ad075a..ec474e554 100644 --- a/pkg/sentry/fs/dev/net_tun.go +++ b/pkg/sentry/fs/dev/net_tun.go @@ -80,8 +80,8 @@ type netTunFileOperations struct { var _ fs.FileOperations = (*netTunFileOperations)(nil) // Release implements fs.FileOperations.Release. -func (fops *netTunFileOperations) Release() { - fops.device.Release() +func (fops *netTunFileOperations) Release(ctx context.Context) { + fops.device.Release(ctx) } // Ioctl implements fs.FileOperations.Ioctl. diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 65be12175..a2f751068 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -325,7 +325,7 @@ func (d *Dirent) SyncAll(ctx context.Context) { for _, w := range d.children { if child := w.Get(); child != nil { child.(*Dirent).SyncAll(ctx) - child.DecRef() + child.DecRef(ctx) } } } @@ -451,7 +451,7 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // which don't hold a hard reference on their parent (their parent holds a // hard reference on them, and they contain virtually no state). But this is // good house-keeping. - child.DecRef() + child.DecRef(ctx) return nil, syscall.ENOENT } @@ -468,20 +468,20 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // their pins on the child. Inotify doesn't properly support filesystems that // revalidate dirents (since watches are lost on revalidation), but if we fail // to unpin the watches child will never be GCed. - cd.Inode.Watches.Unpin(cd) + cd.Inode.Watches.Unpin(ctx, cd) // This child needs to be revalidated, fallthrough to unhash it. Make sure // to not leak a reference from Get(). // // Note that previous lookups may still have a reference to this stale child; // this can't be helped, but we can ensure that *new* lookups are up-to-date. - child.DecRef() + child.DecRef(ctx) } // Either our weak reference expired or we need to revalidate it. Unhash child first, we're // about to replace it. delete(d.children, name) - w.Drop() + w.Drop(ctx) } // Slow path: load the InodeOperations into memory. Since this is a hot path and the lookup may be @@ -512,12 +512,12 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // There are active references to the existing child, prefer it to the one we // retrieved from Lookup. Likely the Lookup happened very close to the insertion // of child, so considering one stale over the other is fairly arbitrary. - c.DecRef() + c.DecRef(ctx) // The child that was installed could be negative. if cd.IsNegative() { // If so, don't leak a reference and short circuit. - child.DecRef() + child.DecRef(ctx) return nil, syscall.ENOENT } @@ -531,7 +531,7 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // we did the Inode.Lookup. Fully drop the weak reference and fallback to using the child // we looked up. delete(d.children, name) - w.Drop() + w.Drop(ctx) } // Give the looked up child a parent. We cannot kick out entries, since we just checked above @@ -587,7 +587,7 @@ func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool { return false } // Child exists. - child.DecRef() + child.DecRef(ctx) return true } @@ -622,7 +622,7 @@ func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags Fi } child := file.Dirent - d.finishCreate(child, name) + d.finishCreate(ctx, child, name) // Return the reference and the new file. When the last reference to // the file is dropped, file.Dirent may no longer be cached. @@ -631,7 +631,7 @@ func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags Fi // finishCreate validates the created file, adds it as a child of this dirent, // and notifies any watchers. -func (d *Dirent) finishCreate(child *Dirent, name string) { +func (d *Dirent) finishCreate(ctx context.Context, child *Dirent, name string) { // Sanity check c, its name must be consistent. if child.name != name { panic(fmt.Sprintf("create from %q to %q returned unexpected name %q", d.name, name, child.name)) @@ -650,14 +650,14 @@ func (d *Dirent) finishCreate(child *Dirent, name string) { panic(fmt.Sprintf("hashed child %q over a positive child", child.name)) } // Don't leak a reference. - old.DecRef() + old.DecRef(ctx) // Drop d's reference. - old.DecRef() + old.DecRef(ctx) } // Finally drop the useless weak reference on the floor. - w.Drop() + w.Drop(ctx) } d.Inode.Watches.Notify(name, linux.IN_CREATE, 0) @@ -686,17 +686,17 @@ func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, c panic(fmt.Sprintf("hashed over a positive child %q", old.(*Dirent).name)) } // Don't leak a reference. - old.DecRef() + old.DecRef(ctx) // Drop d's reference. - old.DecRef() + old.DecRef(ctx) } // Unhash the negative Dirent, name needs to exist now. delete(d.children, name) // Finally drop the useless weak reference on the floor. - w.Drop() + w.Drop(ctx) } // Execute the create operation. @@ -756,7 +756,7 @@ func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, data trans if e != nil { return e } - d.finishCreate(childDir, name) + d.finishCreate(ctx, childDir, name) return nil }) if err == syscall.EEXIST { @@ -901,7 +901,7 @@ func direntReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent, // references to children. // // Preconditions: d.mu must be held. -func (d *Dirent) flush() { +func (d *Dirent) flush(ctx context.Context) { expired := make(map[string]*refs.WeakRef) for n, w := range d.children { // Call flush recursively on each child before removing our @@ -912,7 +912,7 @@ func (d *Dirent) flush() { if !cd.IsNegative() { // Flush the child. cd.mu.Lock() - cd.flush() + cd.flush(ctx) cd.mu.Unlock() // Allow the file system to drop extra references on child. @@ -920,13 +920,13 @@ func (d *Dirent) flush() { } // Don't leak a reference. - child.DecRef() + child.DecRef(ctx) } // Check if the child dirent is closed, and mark it as expired if it is. // We must call w.Get() again here, since the child could have been closed // by the calls to flush() and cache.Remove() in the above if-block. if child := w.Get(); child != nil { - child.DecRef() + child.DecRef(ctx) } else { expired[n] = w } @@ -935,7 +935,7 @@ func (d *Dirent) flush() { // Remove expired entries. for n, w := range expired { delete(d.children, n) - w.Drop() + w.Drop(ctx) } } @@ -977,7 +977,7 @@ func (d *Dirent) mount(ctx context.Context, inode *Inode) (newChild *Dirent, err if !ok { panic("mount must mount over an existing dirent") } - weakRef.Drop() + weakRef.Drop(ctx) // Note that even though `d` is now hidden, it still holds a reference // to its parent. @@ -1002,13 +1002,13 @@ func (d *Dirent) unmount(ctx context.Context, replacement *Dirent) error { if !ok { panic("mount must mount over an existing dirent") } - weakRef.Drop() + weakRef.Drop(ctx) // d is not reachable anymore, and hence not mounted anymore. d.mounted = false // Drop mount reference. - d.DecRef() + d.DecRef(ctx) return nil } @@ -1029,7 +1029,7 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath // Child does not exist. return err } - defer child.DecRef() + defer child.DecRef(ctx) // Remove cannot remove directories. if IsDir(child.Inode.StableAttr) { @@ -1055,7 +1055,7 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath atomic.StoreInt32(&child.deleted, 1) if w, ok := d.children[name]; ok { delete(d.children, name) - w.Drop() + w.Drop(ctx) } // Allow the file system to drop extra references on child. @@ -1067,7 +1067,7 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath // inode may have other links. If this was the last link, the events for the // watch removal will be queued by the inode destructor. child.Inode.Watches.MarkUnlinked() - child.Inode.Watches.Unpin(child) + child.Inode.Watches.Unpin(ctx, child) d.Inode.Watches.Notify(name, linux.IN_DELETE, 0) return nil @@ -1100,7 +1100,7 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) // Child does not exist. return err } - defer child.DecRef() + defer child.DecRef(ctx) // RemoveDirectory can only remove directories. if !IsDir(child.Inode.StableAttr) { @@ -1121,7 +1121,7 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) atomic.StoreInt32(&child.deleted, 1) if w, ok := d.children[name]; ok { delete(d.children, name) - w.Drop() + w.Drop(ctx) } // Allow the file system to drop extra references on child. @@ -1130,14 +1130,14 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) // Finally, let inotify know the child is being unlinked. Drop any extra // refs from inotify to this child dirent. child.Inode.Watches.MarkUnlinked() - child.Inode.Watches.Unpin(child) + child.Inode.Watches.Unpin(ctx, child) d.Inode.Watches.Notify(name, linux.IN_ISDIR|linux.IN_DELETE, 0) return nil } // destroy closes this node and all children. -func (d *Dirent) destroy() { +func (d *Dirent) destroy(ctx context.Context) { if d.IsNegative() { // Nothing to tear-down and no parent references to drop, since a negative // Dirent does not take a references on its parent, has no Inode and no children. @@ -1153,19 +1153,19 @@ func (d *Dirent) destroy() { if c.(*Dirent).IsNegative() { // The parent holds both weak and strong refs in the case of // negative dirents. - c.DecRef() + c.DecRef(ctx) } // Drop the reference we just acquired in WeakRef.Get. - c.DecRef() + c.DecRef(ctx) } - w.Drop() + w.Drop(ctx) } d.children = nil allDirents.remove(d) // Drop our reference to the Inode. - d.Inode.DecRef() + d.Inode.DecRef(ctx) // Allow the Dirent to be GC'ed after this point, since the Inode may still // be referenced after the Dirent is destroyed (for instance by filesystem @@ -1175,7 +1175,7 @@ func (d *Dirent) destroy() { // Drop the reference we have on our parent if we took one. renameMu doesn't need to be // held because d can't be reparented without any references to it left. if d.parent != nil { - d.parent.DecRef() + d.parent.DecRef(ctx) } } @@ -1201,14 +1201,14 @@ func (d *Dirent) TryIncRef() bool { // DecRef decreases the Dirent's refcount and drops its reference on its mount. // // DecRef implements RefCounter.DecRef with destructor d.destroy. -func (d *Dirent) DecRef() { +func (d *Dirent) DecRef(ctx context.Context) { if d.Inode != nil { // Keep mount around, since DecRef may destroy d.Inode. msrc := d.Inode.MountSource - d.DecRefWithDestructor(d.destroy) + d.DecRefWithDestructor(ctx, d.destroy) msrc.DecDirentRefs() } else { - d.DecRefWithDestructor(d.destroy) + d.DecRefWithDestructor(ctx, d.destroy) } } @@ -1359,7 +1359,7 @@ func (d *Dirent) MayDelete(ctx context.Context, root *Dirent, name string) error if err != nil { return err } - defer victim.DecRef() + defer victim.DecRef(ctx) return d.mayDelete(ctx, victim) } @@ -1411,7 +1411,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string if err != nil { return err } - defer renamed.DecRef() + defer renamed.DecRef(ctx) // Check that the renamed dirent is deletable. if err := oldParent.mayDelete(ctx, renamed); err != nil { @@ -1453,13 +1453,13 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // Check that we can delete replaced. if err := newParent.mayDelete(ctx, replaced); err != nil { - replaced.DecRef() + replaced.DecRef(ctx) return err } // Target should not be an ancestor of source. if oldParent.descendantOf(replaced) { - replaced.DecRef() + replaced.DecRef(ctx) // Note that Linux returns EINVAL if the source is an // ancestor of target, but ENOTEMPTY if the target is @@ -1470,7 +1470,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // Check that replaced is not a mount point. if replaced.isMountPointLocked() { - replaced.DecRef() + replaced.DecRef(ctx) return syscall.EBUSY } @@ -1478,11 +1478,11 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string oldIsDir := IsDir(renamed.Inode.StableAttr) newIsDir := IsDir(replaced.Inode.StableAttr) if !newIsDir && oldIsDir { - replaced.DecRef() + replaced.DecRef(ctx) return syscall.ENOTDIR } if !oldIsDir && newIsDir { - replaced.DecRef() + replaced.DecRef(ctx) return syscall.EISDIR } @@ -1493,13 +1493,13 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // open across renames is currently broken for multiple // reasons, so we flush all references on the replaced node and // its children. - replaced.Inode.Watches.Unpin(replaced) + replaced.Inode.Watches.Unpin(ctx, replaced) replaced.mu.Lock() - replaced.flush() + replaced.flush(ctx) replaced.mu.Unlock() // Done with replaced. - replaced.DecRef() + replaced.DecRef(ctx) } if err := renamed.Inode.Rename(ctx, oldParent, renamed, newParent, newName, replaced != nil); err != nil { @@ -1513,14 +1513,14 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // can't destroy oldParent (and try to retake its lock) because // Rename's caller must be holding a reference. newParent.IncRef() - oldParent.DecRef() + oldParent.DecRef(ctx) } if w, ok := newParent.children[newName]; ok { - w.Drop() + w.Drop(ctx) delete(newParent.children, newName) } if w, ok := oldParent.children[oldName]; ok { - w.Drop() + w.Drop(ctx) delete(oldParent.children, oldName) } @@ -1551,7 +1551,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // Same as replaced.flush above. renamed.mu.Lock() - renamed.flush() + renamed.flush(ctx) renamed.mu.Unlock() return nil diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go index 33de32c69..7d9dd717e 100644 --- a/pkg/sentry/fs/dirent_cache.go +++ b/pkg/sentry/fs/dirent_cache.go @@ -17,6 +17,7 @@ package fs import ( "fmt" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" ) @@ -101,7 +102,7 @@ func (c *DirentCache) remove(d *Dirent) { panic(fmt.Sprintf("trying to remove %v, which is not in the dirent cache", d)) } c.list.Remove(d) - d.DecRef() + d.DecRef(context.Background()) c.currentSize-- if c.limit != nil { c.limit.dec() diff --git a/pkg/sentry/fs/dirent_refs_test.go b/pkg/sentry/fs/dirent_refs_test.go index 98d69c6f2..176b894ba 100644 --- a/pkg/sentry/fs/dirent_refs_test.go +++ b/pkg/sentry/fs/dirent_refs_test.go @@ -51,7 +51,7 @@ func TestWalkPositive(t *testing.T) { t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, 1) } - d.DecRef() + d.DecRef(ctx) if got := root.ReadRefs(); got != 1 { t.Fatalf("root has a ref count of %d, want %d", got, 1) @@ -61,7 +61,7 @@ func TestWalkPositive(t *testing.T) { t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, 0) } - root.flush() + root.flush(ctx) if got := len(root.children); got != 0 { t.Fatalf("root has %d children, want %d", got, 0) @@ -114,7 +114,7 @@ func TestWalkNegative(t *testing.T) { t.Fatalf("child has a ref count of %d, want %d", got, 2) } - child.DecRef() + child.DecRef(ctx) if got := child.(*Dirent).ReadRefs(); got != 1 { t.Fatalf("child has a ref count of %d, want %d", got, 1) @@ -124,7 +124,7 @@ func TestWalkNegative(t *testing.T) { t.Fatalf("root has %d children, want %d", got, 1) } - root.DecRef() + root.DecRef(ctx) if got := root.ReadRefs(); got != 0 { t.Fatalf("root has a ref count of %d, want %d", got, 0) @@ -351,9 +351,9 @@ func TestRemoveExtraRefs(t *testing.T) { t.Fatalf("dirent has a ref count of %d, want %d", got, 1) } - d.DecRef() + d.DecRef(ctx) - test.root.flush() + test.root.flush(ctx) if got := len(test.root.children); got != 0 { t.Errorf("root has %d children, want %d", got, 0) @@ -403,8 +403,8 @@ func TestRenameExtraRefs(t *testing.T) { t.Fatalf("Rename got error %v, want nil", err) } - oldParent.flush() - newParent.flush() + oldParent.flush(ctx) + newParent.flush(ctx) // Expect to have only active references. if got := renamed.ReadRefs(); got != 1 { diff --git a/pkg/sentry/fs/dirent_state.go b/pkg/sentry/fs/dirent_state.go index f623d6c0e..67a35f0b2 100644 --- a/pkg/sentry/fs/dirent_state.go +++ b/pkg/sentry/fs/dirent_state.go @@ -18,6 +18,7 @@ import ( "fmt" "sync/atomic" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" ) @@ -48,7 +49,7 @@ func (d *Dirent) saveChildren() map[string]*Dirent { for name, w := range d.children { if rc := w.Get(); rc != nil { // Drop the reference count obtain in w.Get() - rc.DecRef() + rc.DecRef(context.Background()) cd := rc.(*Dirent) if cd.IsNegative() { diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index 9fce177ad..b99199798 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -115,7 +115,7 @@ func (p *pipeOperations) Readiness(mask waiter.EventMask) (eventMask waiter.Even } // Release implements fs.FileOperations.Release. -func (p *pipeOperations) Release() { +func (p *pipeOperations) Release(context.Context) { fdnotifier.RemoveFD(int32(p.file.FD())) p.file.Close() p.file = nil diff --git a/pkg/sentry/fs/fdpipe/pipe_opener_test.go b/pkg/sentry/fs/fdpipe/pipe_opener_test.go index e556da48a..b9cec4b13 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener_test.go @@ -182,7 +182,7 @@ func TestTryOpen(t *testing.T) { // Cleanup the state of the pipe, and remove the fd from the // fdnotifier. Sadly this needed to maintain the correctness // of other tests because the fdnotifier is global. - pipeOps.Release() + pipeOps.Release(ctx) } continue } @@ -191,7 +191,7 @@ func TestTryOpen(t *testing.T) { } if pipeOps != nil { // Same as above. - pipeOps.Release() + pipeOps.Release(ctx) } } } @@ -279,7 +279,7 @@ func TestPipeOpenUnblocksEventually(t *testing.T) { pipeOps, err := Open(ctx, opener, flags) if pipeOps != nil { // Same as TestTryOpen. - pipeOps.Release() + pipeOps.Release(ctx) } // Check that the partner opened the file successfully. @@ -325,7 +325,7 @@ func TestCopiedReadAheadBuffer(t *testing.T) { ctx := contexttest.Context(t) pipeOps, err := pipeOpenState.TryOpen(ctx, opener, fs.FileFlags{Read: true}) if pipeOps != nil { - pipeOps.Release() + pipeOps.Release(ctx) t.Fatalf("open(%s, %o) got file, want nil", name, syscall.O_RDONLY) } if err != syserror.ErrWouldBlock { @@ -351,7 +351,7 @@ func TestCopiedReadAheadBuffer(t *testing.T) { if pipeOps == nil { t.Fatalf("open(%s, %o) got nil file, want not nil", name, syscall.O_RDONLY) } - defer pipeOps.Release() + defer pipeOps.Release(ctx) if err != nil { t.Fatalf("open(%s, %o) got error %v, want nil", name, syscall.O_RDONLY, err) @@ -471,14 +471,14 @@ func TestPipeHangup(t *testing.T) { f := <-fdchan if f < 0 { t.Errorf("%s: partner routine got fd %d, want > 0", test.desc, f) - pipeOps.Release() + pipeOps.Release(ctx) continue } if test.hangupSelf { // Hangup self and assert that our partner got the expected hangup // error. - pipeOps.Release() + pipeOps.Release(ctx) if test.flags.Read { // Partner is writer. @@ -490,7 +490,7 @@ func TestPipeHangup(t *testing.T) { } else { // Hangup our partner and expect us to get the hangup error. syscall.Close(f) - defer pipeOps.Release() + defer pipeOps.Release(ctx) if test.flags.Read { assertReaderHungup(t, test.desc, pipeOps.(*pipeOperations).file) diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go index a0082ecca..1c9e82562 100644 --- a/pkg/sentry/fs/fdpipe/pipe_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_test.go @@ -98,10 +98,11 @@ func TestNewPipe(t *testing.T) { } f := fd.New(gfd) - p, err := newPipeOperations(contexttest.Context(t), nil, test.flags, f, test.readAheadBuffer) + ctx := contexttest.Context(t) + p, err := newPipeOperations(ctx, nil, test.flags, f, test.readAheadBuffer) if p != nil { // This is necessary to remove the fd from the global fd notifier. - defer p.Release() + defer p.Release(ctx) } else { // If there is no p to DecRef on, because newPipeOperations failed, then the // file still needs to be closed. @@ -153,13 +154,14 @@ func TestPipeDestruction(t *testing.T) { syscall.Close(fds[1]) // Test the read end, but it doesn't really matter which. - p, err := newPipeOperations(contexttest.Context(t), nil, fs.FileFlags{Read: true}, f, nil) + ctx := contexttest.Context(t) + p, err := newPipeOperations(ctx, nil, fs.FileFlags{Read: true}, f, nil) if err != nil { f.Close() t.Fatalf("newPipeOperations got error %v, want nil", err) } // Drop our only reference, which should trigger the destructor. - p.Release() + p.Release(ctx) if fdnotifier.HasFD(int32(fds[0])) { t.Fatalf("after DecRef fdnotifier has fd %d, want no longer registered", fds[0]) @@ -282,7 +284,7 @@ func TestPipeRequest(t *testing.T) { if err != nil { t.Fatalf("%s: newPipeOperations got error %v, want nil", test.desc, err) } - defer p.Release() + defer p.Release(ctx) inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Pipe}) file := fs.NewFile(ctx, fs.NewDirent(ctx, inode, "pipe"), fs.FileFlags{Read: true}, p) @@ -334,7 +336,7 @@ func TestPipeReadAheadBuffer(t *testing.T) { rfile.Close() t.Fatalf("newPipeOperations got error %v, want nil", err) } - defer p.Release() + defer p.Release(ctx) inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{ Type: fs.Pipe, @@ -380,7 +382,7 @@ func TestPipeReadsAccumulate(t *testing.T) { } // Don't forget to remove the fd from the fd notifier. Otherwise other tests will // likely be borked, because it's global :( - defer p.Release() + defer p.Release(ctx) inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{ Type: fs.Pipe, @@ -448,7 +450,7 @@ func TestPipeWritesAccumulate(t *testing.T) { } // Don't forget to remove the fd from the fd notifier. Otherwise other tests // will likely be borked, because it's global :( - defer p.Release() + defer p.Release(ctx) inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{ Type: fs.Pipe, diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index ca41520b4..72ea70fcf 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -142,17 +142,17 @@ func NewFile(ctx context.Context, dirent *Dirent, flags FileFlags, fops FileOper } // DecRef destroys the File when it is no longer referenced. -func (f *File) DecRef() { - f.DecRefWithDestructor(func() { +func (f *File) DecRef(ctx context.Context) { + f.DecRefWithDestructor(ctx, func(context.Context) { // Drop BSD style locks. lockRng := lock.LockRange{Start: 0, End: lock.LockEOF} f.Dirent.Inode.LockCtx.BSD.UnlockRegion(f, lockRng) // Release resources held by the FileOperations. - f.FileOperations.Release() + f.FileOperations.Release(ctx) // Release a reference on the Dirent. - f.Dirent.DecRef() + f.Dirent.DecRef(ctx) // Only unregister if we are currently registered. There is nothing // to register if f.async is nil (this happens when async mode is @@ -460,7 +460,7 @@ func (f *File) UnstableAttr(ctx context.Context) (UnstableAttr, error) { func (f *File) MappedName(ctx context.Context) string { root := RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } name, _ := f.Dirent.FullName(root) return name diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go index f5537411e..305c0f840 100644 --- a/pkg/sentry/fs/file_operations.go +++ b/pkg/sentry/fs/file_operations.go @@ -67,7 +67,7 @@ type SpliceOpts struct { // - File.Flags(): This value may change during the operation. type FileOperations interface { // Release release resources held by FileOperations. - Release() + Release(ctx context.Context) // Waitable defines how this File can be waited on for read and // write readiness. diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index dcc1df38f..9dc58d5ff 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -54,7 +54,7 @@ func overlayFile(ctx context.Context, inode *Inode, flags FileFlags) (*File, err // Drop the extra reference on the Dirent. Now there's only one reference // on the dirent, either owned by f (if non-nil), or the Dirent is about // to be destroyed (if GetFile failed). - dirent.DecRef() + dirent.DecRef(ctx) return f, err } @@ -89,12 +89,12 @@ type overlayFileOperations struct { } // Release implements FileOperations.Release. -func (f *overlayFileOperations) Release() { +func (f *overlayFileOperations) Release(ctx context.Context) { if f.upper != nil { - f.upper.DecRef() + f.upper.DecRef(ctx) } if f.lower != nil { - f.lower.DecRef() + f.lower.DecRef(ctx) } } @@ -164,7 +164,7 @@ func (f *overlayFileOperations) Seek(ctx context.Context, file *File, whence See func (f *overlayFileOperations) Readdir(ctx context.Context, file *File, serializer DentrySerializer) (int64, error) { root := RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &DirCtx{ @@ -497,7 +497,7 @@ func readdirOne(ctx context.Context, d *Dirent) (map[string]DentAttr, error) { if err != nil { return nil, err } - defer dir.DecRef() + defer dir.DecRef(ctx) // Use a stub serializer to read the entries into memory. stubSerializer := &CollectEntriesSerializer{} @@ -521,10 +521,10 @@ type overlayMappingIdentity struct { } // DecRef implements AtomicRefCount.DecRef. -func (omi *overlayMappingIdentity) DecRef() { - omi.AtomicRefCount.DecRefWithDestructor(func() { - omi.overlayFile.DecRef() - omi.id.DecRef() +func (omi *overlayMappingIdentity) DecRef(ctx context.Context) { + omi.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) { + omi.overlayFile.DecRef(ctx) + omi.id.DecRef(ctx) }) } @@ -544,7 +544,7 @@ func (omi *overlayMappingIdentity) InodeID() uint64 { func (omi *overlayMappingIdentity) MappedName(ctx context.Context) string { root := RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } name, _ := omi.overlayFile.Dirent.FullName(root) return name diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index 08695391c..dc9efa5df 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -31,7 +31,7 @@ import ( type FileNoopRelease struct{} // Release is a no-op. -func (FileNoopRelease) Release() {} +func (FileNoopRelease) Release(context.Context) {} // SeekWithDirCursor is used to implement fs.FileOperations.Seek. If dirCursor // is not nil and the seek was on a directory, the cursor will be updated. @@ -296,7 +296,7 @@ func (sdfo *StaticDirFileOperations) IterateDir(ctx context.Context, d *fs.Diren func (sdfo *StaticDirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &fs.DirCtx{ Serializer: serializer, diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index b2fcab127..c0bc63a32 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -114,7 +114,7 @@ func NewFile(ctx context.Context, dirent *fs.Dirent, name string, flags fs.FileF } // Release implements fs.FileOpeations.Release. -func (f *fileOperations) Release() { +func (f *fileOperations) Release(context.Context) { f.handles.DecRef() } @@ -122,7 +122,7 @@ func (f *fileOperations) Release() { func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &fs.DirCtx{ diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go index 2df2fe889..326fed954 100644 --- a/pkg/sentry/fs/gofer/gofer_test.go +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -232,7 +232,7 @@ func TestRevalidation(t *testing.T) { // We must release the dirent, of the test will fail // with a reference leak. This is tracked by p9test. - defer dirent.DecRef() + defer dirent.DecRef(ctx) // Walk again. Depending on the cache policy, we may // get a new dirent. @@ -246,7 +246,7 @@ func TestRevalidation(t *testing.T) { if !test.preModificationWantReload && dirent != newDirent { t.Errorf("Lookup with cachePolicy=%s got new dirent %+v, wanted old dirent %+v", test.cachePolicy, newDirent, dirent) } - newDirent.DecRef() // See above. + newDirent.DecRef(ctx) // See above. // Modify the underlying mocked file's modification // time for the next walk that occurs. @@ -287,7 +287,7 @@ func TestRevalidation(t *testing.T) { if test.postModificationWantUpdatedAttrs && gotModTimeSeconds != nowSeconds { t.Fatalf("Lookup with cachePolicy=%s got new modification time %v, wanted %v", test.cachePolicy, gotModTimeSeconds, nowSeconds) } - newDirent.DecRef() // See above. + newDirent.DecRef(ctx) // See above. // Remove the file from the remote fs, subsequent walks // should now fail to find anything. @@ -303,7 +303,7 @@ func TestRevalidation(t *testing.T) { t.Errorf("Lookup with cachePolicy=%s got new dirent and error %v, wanted old dirent and nil error", test.cachePolicy, err) } if err == nil { - newDirent.DecRef() // See above. + newDirent.DecRef(ctx) // See above. } }) } diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go index fc14249be..f324dbf26 100644 --- a/pkg/sentry/fs/gofer/handles.go +++ b/pkg/sentry/fs/gofer/handles.go @@ -47,7 +47,8 @@ type handles struct { // DecRef drops a reference on handles. func (h *handles) DecRef() { - h.DecRefWithDestructor(func() { + ctx := context.Background() + h.DecRefWithDestructor(ctx, func(context.Context) { if h.Host != nil { if h.isHostBorrowed { h.Host.Release() @@ -57,7 +58,7 @@ func (h *handles) DecRef() { } } } - if err := h.File.close(context.Background()); err != nil { + if err := h.File.close(ctx); err != nil { log.Warningf("error closing p9 file: %v", err) } }) diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 51d7368a1..3a225fd39 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -441,8 +441,9 @@ func (i *inodeOperations) Release(ctx context.Context) { // asynchronously. // // We use AsyncWithContext to avoid needing to allocate an extra - // anonymous function on the heap. - fs.AsyncWithContext(ctx, i.fileState.Release) + // anonymous function on the heap. We must use background context + // because the async work cannot happen on the task context. + fs.AsyncWithContext(context.Background(), i.fileState.Release) } // Mappable implements fs.InodeOperations.Mappable. diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index cf9800100..3c66dc3c2 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -168,7 +168,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string // Construct the positive Dirent. d := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name) - defer d.DecRef() + defer d.DecRef(ctx) // Construct the new file, caching the handles if allowed. h := handles{ @@ -371,7 +371,7 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string // Find out if file being deleted is a socket or pipe that needs to be // removed from endpoint map. if d, err := i.Lookup(ctx, dir, name); err == nil { - defer d.DecRef() + defer d.DecRef(ctx) if fs.IsSocket(d.Inode.StableAttr) || fs.IsPipe(d.Inode.StableAttr) { switch iops := d.Inode.InodeOperations.(type) { @@ -392,7 +392,7 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string return err } if key != nil { - i.session().overrides.remove(*key) + i.session().overrides.remove(ctx, *key) } i.touchModificationAndStatusChangeTime(ctx, dir) diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index b5efc86f2..7cf3522ff 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -89,10 +89,10 @@ func (e *overrideMaps) addPipe(key device.MultiDeviceKey, d *fs.Dirent, inode *f // remove deletes the key from the maps. // // Precondition: maps must have been locked with 'lock'. -func (e *overrideMaps) remove(key device.MultiDeviceKey) { +func (e *overrideMaps) remove(ctx context.Context, key device.MultiDeviceKey) { endpoint := e.keyMap[key] delete(e.keyMap, key) - endpoint.dirent.DecRef() + endpoint.dirent.DecRef(ctx) } // lock blocks other addition and removal operations from happening while @@ -197,7 +197,7 @@ type session struct { } // Destroy tears down the session. -func (s *session) Destroy() { +func (s *session) Destroy(ctx context.Context) { s.client.Close() } @@ -329,7 +329,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF s.client, err = p9.NewClient(conn, s.msize, s.version) if err != nil { // Drop our reference on the session, it needs to be torn down. - s.DecRef() + s.DecRef(ctx) return nil, err } @@ -340,7 +340,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF ctx.UninterruptibleSleepFinish(false) if err != nil { // Same as above. - s.DecRef() + s.DecRef(ctx) return nil, err } @@ -348,7 +348,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF if err != nil { s.attach.close(ctx) // Same as above, but after we execute the Close request. - s.DecRef() + s.DecRef(ctx) return nil, err } @@ -393,13 +393,13 @@ func (s *session) fillKeyMap(ctx context.Context) error { // fillPathMap populates paths for overrides from dirents in direntMap // before save. -func (s *session) fillPathMap() error { +func (s *session) fillPathMap(ctx context.Context) error { unlock := s.overrides.lock() defer unlock() for _, endpoint := range s.overrides.keyMap { mountRoot := endpoint.dirent.MountRoot() - defer mountRoot.DecRef() + defer mountRoot.DecRef(ctx) dirPath, _ := endpoint.dirent.FullName(mountRoot) if dirPath == "" { return fmt.Errorf("error getting path from dirent") diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index 2d398b753..48b423dd8 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -26,7 +26,8 @@ import ( // beforeSave is invoked by stateify. func (s *session) beforeSave() { if s.overrides != nil { - if err := s.fillPathMap(); err != nil { + ctx := &dummyClockContext{context.Background()} + if err := s.fillPathMap(ctx); err != nil { panic("failed to save paths to override map before saving" + err.Error()) } } diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index 40f2c1cad..8a1c69ac2 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -134,14 +134,14 @@ func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.Connect // We don't need the receiver. c.CloseRecv() - c.Release() + c.Release(ctx) return c, nil } // Release implements transport.BoundEndpoint.Release. -func (e *endpoint) Release() { - e.inode.DecRef() +func (e *endpoint) Release(ctx context.Context) { + e.inode.DecRef(ctx) } // Passcred implements transport.BoundEndpoint.Passcred. diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go index 39299b7e4..0d8d36afa 100644 --- a/pkg/sentry/fs/host/control.go +++ b/pkg/sentry/fs/host/control.go @@ -57,7 +57,7 @@ func (c *scmRights) Clone() transport.RightsControlMessage { } // Release implements transport.RightsControlMessage.Release. -func (c *scmRights) Release() { +func (c *scmRights) Release(ctx context.Context) { for _, fd := range c.fds { syscall.Close(fd) } diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index 3e48b8b2c..86d1a87f0 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -110,7 +110,7 @@ func newFileFromDonatedFD(ctx context.Context, donated int, saveable, isTTY bool name := fmt.Sprintf("host:[%d]", inode.StableAttr.InodeID) dirent := fs.NewDirent(ctx, inode, name) - defer dirent.DecRef() + defer dirent.DecRef(ctx) if isTTY { return newTTYFile(ctx, dirent, flags, iops), nil @@ -169,7 +169,7 @@ func (f *fileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &fs.DirCtx{ Serializer: serializer, diff --git a/pkg/sentry/fs/host/inode_test.go b/pkg/sentry/fs/host/inode_test.go index c507f57eb..41a23b5da 100644 --- a/pkg/sentry/fs/host/inode_test.go +++ b/pkg/sentry/fs/host/inode_test.go @@ -36,7 +36,7 @@ func TestCloseFD(t *testing.T) { if err != nil { t.Fatalf("Failed to create File: %v", err) } - file.DecRef() + file.DecRef(ctx) s := make([]byte, 10) if c, err := syscall.Read(p[0], s); c != 0 || err != nil { diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index cfb089e43..a2f3d5918 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -194,7 +194,7 @@ func newSocket(ctx context.Context, orgfd int, saveable bool) (*fs.File, error) } // Send implements transport.ConnectedEndpoint.Send. -func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { +func (c *ConnectedEndpoint) Send(ctx context.Context, data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { c.mu.RLock() defer c.mu.RUnlock() @@ -271,7 +271,7 @@ func (c *ConnectedEndpoint) EventUpdate() { } // Recv implements transport.Receiver.Recv. -func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { +func (c *ConnectedEndpoint) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { c.mu.RLock() defer c.mu.RUnlock() @@ -318,7 +318,7 @@ func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights int, peek } // close releases all resources related to the endpoint. -func (c *ConnectedEndpoint) close() { +func (c *ConnectedEndpoint) close(context.Context) { fdnotifier.RemoveFD(int32(c.file.FD())) c.file.Close() c.file = nil @@ -374,8 +374,8 @@ func (c *ConnectedEndpoint) RecvMaxQueueSize() int64 { } // Release implements transport.ConnectedEndpoint.Release and transport.Receiver.Release. -func (c *ConnectedEndpoint) Release() { - c.ref.DecRefWithDestructor(c.close) +func (c *ConnectedEndpoint) Release(ctx context.Context) { + c.ref.DecRefWithDestructor(ctx, c.close) } // CloseUnread implements transport.ConnectedEndpoint.CloseUnread. diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go index affdbcacb..9d58ea448 100644 --- a/pkg/sentry/fs/host/socket_test.go +++ b/pkg/sentry/fs/host/socket_test.go @@ -67,11 +67,12 @@ func TestSocketIsBlocking(t *testing.T) { if fl&syscall.O_NONBLOCK == syscall.O_NONBLOCK { t.Fatalf("Expected socket %v to be blocking", pair[1]) } - sock, err := newSocket(contexttest.Context(t), pair[0], false) + ctx := contexttest.Context(t) + sock, err := newSocket(ctx, pair[0], false) if err != nil { t.Fatalf("newSocket(%v) failed => %v", pair[0], err) } - defer sock.DecRef() + defer sock.DecRef(ctx) // Test that the socket now is non-blocking. if fl, err = getFl(pair[0]); err != nil { t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[0], err) @@ -93,11 +94,12 @@ func TestSocketWritev(t *testing.T) { if err != nil { t.Fatalf("host socket creation failed: %v", err) } - socket, err := newSocket(contexttest.Context(t), pair[0], false) + ctx := contexttest.Context(t) + socket, err := newSocket(ctx, pair[0], false) if err != nil { t.Fatalf("newSocket(%v) => %v", pair[0], err) } - defer socket.DecRef() + defer socket.DecRef(ctx) buf := []byte("hello world\n") n, err := socket.Writev(contexttest.Context(t), usermem.BytesIOSequence(buf)) if err != nil { @@ -115,11 +117,12 @@ func TestSocketWritevLen0(t *testing.T) { if err != nil { t.Fatalf("host socket creation failed: %v", err) } - socket, err := newSocket(contexttest.Context(t), pair[0], false) + ctx := contexttest.Context(t) + socket, err := newSocket(ctx, pair[0], false) if err != nil { t.Fatalf("newSocket(%v) => %v", pair[0], err) } - defer socket.DecRef() + defer socket.DecRef(ctx) n, err := socket.Writev(contexttest.Context(t), usermem.BytesIOSequence(nil)) if err != nil { t.Fatalf("socket writev failed: %v", err) @@ -136,11 +139,12 @@ func TestSocketSendMsgLen0(t *testing.T) { if err != nil { t.Fatalf("host socket creation failed: %v", err) } - sfile, err := newSocket(contexttest.Context(t), pair[0], false) + ctx := contexttest.Context(t) + sfile, err := newSocket(ctx, pair[0], false) if err != nil { t.Fatalf("newSocket(%v) => %v", pair[0], err) } - defer sfile.DecRef() + defer sfile.DecRef(ctx) s := sfile.FileOperations.(socket.Socket) n, terr := s.SendMsg(nil, usermem.BytesIOSequence(nil), []byte{}, 0, false, ktime.Time{}, socket.ControlMessages{}) @@ -158,18 +162,19 @@ func TestListen(t *testing.T) { if err != nil { t.Fatalf("syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) => %v", err) } - sfile1, err := newSocket(contexttest.Context(t), pair[0], false) + ctx := contexttest.Context(t) + sfile1, err := newSocket(ctx, pair[0], false) if err != nil { t.Fatalf("newSocket(%v) => %v", pair[0], err) } - defer sfile1.DecRef() + defer sfile1.DecRef(ctx) socket1 := sfile1.FileOperations.(socket.Socket) - sfile2, err := newSocket(contexttest.Context(t), pair[1], false) + sfile2, err := newSocket(ctx, pair[1], false) if err != nil { t.Fatalf("newSocket(%v) => %v", pair[1], err) } - defer sfile2.DecRef() + defer sfile2.DecRef(ctx) socket2 := sfile2.FileOperations.(socket.Socket) // Socketpairs can not be listened to. @@ -185,11 +190,11 @@ func TestListen(t *testing.T) { if err != nil { t.Fatalf("syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) => %v", err) } - sfile3, err := newSocket(contexttest.Context(t), sock, false) + sfile3, err := newSocket(ctx, sock, false) if err != nil { t.Fatalf("newSocket(%v) => %v", sock, err) } - defer sfile3.DecRef() + defer sfile3.DecRef(ctx) socket3 := sfile3.FileOperations.(socket.Socket) // This socket is not bound so we can't listen on it. @@ -237,9 +242,10 @@ func TestRelease(t *testing.T) { } c := &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)} want := &ConnectedEndpoint{queue: c.queue} - want.ref.DecRef() + ctx := contexttest.Context(t) + want.ref.DecRef(ctx) fdnotifier.AddFD(int32(c.file.FD()), nil) - c.Release() + c.Release(ctx) if !reflect.DeepEqual(c, want) { t.Errorf("got = %#v, want = %#v", c, want) } diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index 82a02fcb2..b5229098c 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -113,12 +113,12 @@ func (t *TTYFileOperations) Write(ctx context.Context, file *fs.File, src userme } // Release implements fs.FileOperations.Release. -func (t *TTYFileOperations) Release() { +func (t *TTYFileOperations) Release(ctx context.Context) { t.mu.Lock() t.fgProcessGroup = nil t.mu.Unlock() - t.fileOperations.Release() + t.fileOperations.Release(ctx) } // Ioctl implements fs.FileOperations.Ioctl. diff --git a/pkg/sentry/fs/host/wait_test.go b/pkg/sentry/fs/host/wait_test.go index ce397a5e3..c143f4ce2 100644 --- a/pkg/sentry/fs/host/wait_test.go +++ b/pkg/sentry/fs/host/wait_test.go @@ -39,7 +39,7 @@ func TestWait(t *testing.T) { t.Fatalf("NewFile failed: %v", err) } - defer file.DecRef() + defer file.DecRef(ctx) r := file.Readiness(waiter.EventIn) if r != 0 { diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index a34fbc946..b79cd9877 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -96,13 +96,12 @@ func NewInode(ctx context.Context, iops InodeOperations, msrc *MountSource, satt } // DecRef drops a reference on the Inode. -func (i *Inode) DecRef() { - i.DecRefWithDestructor(i.destroy) +func (i *Inode) DecRef(ctx context.Context) { + i.DecRefWithDestructor(ctx, i.destroy) } // destroy releases the Inode and releases the msrc reference taken. -func (i *Inode) destroy() { - ctx := context.Background() +func (i *Inode) destroy(ctx context.Context) { if err := i.WriteOut(ctx); err != nil { // FIXME(b/65209558): Mark as warning again once noatime is // properly supported. @@ -122,12 +121,12 @@ func (i *Inode) destroy() { i.Watches.targetDestroyed() if i.overlay != nil { - i.overlay.release() + i.overlay.release(ctx) } else { i.InodeOperations.Release(ctx) } - i.MountSource.DecRef() + i.MountSource.DecRef(ctx) } // Mappable calls i.InodeOperations.Mappable. diff --git a/pkg/sentry/fs/inode_inotify.go b/pkg/sentry/fs/inode_inotify.go index efd3c962b..9911a00c2 100644 --- a/pkg/sentry/fs/inode_inotify.go +++ b/pkg/sentry/fs/inode_inotify.go @@ -17,6 +17,7 @@ package fs import ( "fmt" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" ) @@ -136,11 +137,11 @@ func (w *Watches) Notify(name string, events, cookie uint32) { } // Unpin unpins dirent from all watches in this set. -func (w *Watches) Unpin(d *Dirent) { +func (w *Watches) Unpin(ctx context.Context, d *Dirent) { w.mu.RLock() defer w.mu.RUnlock() for _, watch := range w.ws { - watch.Unpin(d) + watch.Unpin(ctx, d) } } diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 537c8d257..dc2e353d9 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -85,7 +85,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name upperInode = child.Inode upperInode.IncRef() } - child.DecRef() + child.DecRef(ctx) } // Are we done? @@ -108,7 +108,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name entry, err := newOverlayEntry(ctx, upperInode, nil, false) if err != nil { // Don't leak resources. - upperInode.DecRef() + upperInode.DecRef(ctx) parent.copyMu.RUnlock() return nil, false, err } @@ -129,7 +129,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name if err != nil && err != syserror.ENOENT { // Don't leak resources. if upperInode != nil { - upperInode.DecRef() + upperInode.DecRef(ctx) } parent.copyMu.RUnlock() return nil, false, err @@ -152,7 +152,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name } } } - child.DecRef() + child.DecRef(ctx) } } @@ -183,7 +183,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name // unnecessary because we don't need to copy-up and we will always // operate (e.g. read/write) on the upper Inode. if !IsDir(upperInode.StableAttr) { - lowerInode.DecRef() + lowerInode.DecRef(ctx) lowerInode = nil } } @@ -194,10 +194,10 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name // Well, not quite, we failed at the last moment, how depressing. // Be sure not to leak resources. if upperInode != nil { - upperInode.DecRef() + upperInode.DecRef(ctx) } if lowerInode != nil { - lowerInode.DecRef() + lowerInode.DecRef(ctx) } parent.copyMu.RUnlock() return nil, false, err @@ -248,7 +248,7 @@ func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name st // user) will clobber the real path for the underlying Inode. upperFile.Dirent.Inode.IncRef() upperDirent := NewTransientDirent(upperFile.Dirent.Inode) - upperFile.Dirent.DecRef() + upperFile.Dirent.DecRef(ctx) upperFile.Dirent = upperDirent // Create the overlay inode and dirent. We need this to construct the @@ -259,7 +259,7 @@ func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name st // The overlay file created below with NewFile will take a reference on // the overlayDirent, and it should be the only thing holding a // reference at the time of creation, so we must drop this reference. - defer overlayDirent.DecRef() + defer overlayDirent.DecRef(ctx) // Create a new overlay file that wraps the upper file. flags.Pread = upperFile.Flags().Pread @@ -399,7 +399,7 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena if !replaced.IsNegative() && IsDir(replaced.Inode.StableAttr) { children, err := readdirOne(ctx, replaced) if err != nil { - replaced.DecRef() + replaced.DecRef(ctx) return err } @@ -407,12 +407,12 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena // included among the returned children, so we don't // need to bother checking for them. if len(children) > 0 { - replaced.DecRef() + replaced.DecRef(ctx) return syserror.ENOTEMPTY } } - replaced.DecRef() + replaced.DecRef(ctx) } } @@ -455,12 +455,12 @@ func overlayBind(ctx context.Context, o *overlayEntry, parent *Dirent, name stri // Grab the inode and drop the dirent, we don't need it. inode := d.Inode inode.IncRef() - d.DecRef() + d.DecRef(ctx) // Create a new overlay entry and dirent for the socket. entry, err := newOverlayEntry(ctx, inode, nil, false) if err != nil { - inode.DecRef() + inode.DecRef(ctx) return nil, err } // Use the parent's MountSource, since that corresponds to the overlay, @@ -672,7 +672,7 @@ func overlayGetlink(ctx context.Context, o *overlayEntry) (*Dirent, error) { // ground and claim that jumping around the filesystem like this // is not supported. name, _ := dirent.FullName(nil) - dirent.DecRef() + dirent.DecRef(ctx) // Claim that the path is not accessible. err = syserror.EACCES diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index 389c219d6..aa9851b26 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -316,7 +316,7 @@ func TestCacheFlush(t *testing.T) { t.Fatalf("NewMountNamespace failed: %v", err) } root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) ctx = &rootContext{ Context: ctx, @@ -345,7 +345,7 @@ func TestCacheFlush(t *testing.T) { } // Drop the file reference. - file.DecRef() + file.DecRef(ctx) // Dirent should have 2 refs left. if got, want := dirent.ReadRefs(), 2; int(got) != want { @@ -361,7 +361,7 @@ func TestCacheFlush(t *testing.T) { } // Drop our ref. - dirent.DecRef() + dirent.DecRef(ctx) // We should be back to zero refs. if got, want := dirent.ReadRefs(), 0; int(got) != want { @@ -398,7 +398,7 @@ func (d *dir) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags if err != nil { return nil, err } - defer file.DecRef() + defer file.DecRef(ctx) // Wrap the file's FileOperations in a dirFile. fops := &dirFile{ FileOperations: file.FileOperations, diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index e3a715c1f..c5c07d564 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -80,7 +80,7 @@ func NewInotify(ctx context.Context) *Inotify { // Release implements FileOperations.Release. Release removes all watches and // frees all resources for an inotify instance. -func (i *Inotify) Release() { +func (i *Inotify) Release(ctx context.Context) { // We need to hold i.mu to avoid a race with concurrent calls to // Inotify.targetDestroyed from Watches. There's no risk of Watches // accessing this Inotify after the destructor ends, because we remove all @@ -93,7 +93,7 @@ func (i *Inotify) Release() { // the owner's destructor. w.target.Watches.Remove(w.ID()) // Don't leak any references to the target, held by pins in the watch. - w.destroy() + w.destroy(ctx) } } @@ -321,7 +321,7 @@ func (i *Inotify) AddWatch(target *Dirent, mask uint32) int32 { // // RmWatch looks up an inotify watch for the given 'wd' and configures the // target dirent to stop sending events to this inotify instance. -func (i *Inotify) RmWatch(wd int32) error { +func (i *Inotify) RmWatch(ctx context.Context, wd int32) error { i.mu.Lock() // Find the watch we were asked to removed. @@ -346,7 +346,7 @@ func (i *Inotify) RmWatch(wd int32) error { i.queueEvent(newEvent(watch.wd, "", linux.IN_IGNORED, 0)) // Remove all pins. - watch.destroy() + watch.destroy(ctx) return nil } diff --git a/pkg/sentry/fs/inotify_watch.go b/pkg/sentry/fs/inotify_watch.go index 900cba3ca..605423d22 100644 --- a/pkg/sentry/fs/inotify_watch.go +++ b/pkg/sentry/fs/inotify_watch.go @@ -18,6 +18,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" ) @@ -105,12 +106,12 @@ func (w *Watch) Pin(d *Dirent) { // Unpin drops any extra refs held on dirent due to a previous Pin // call. Calling Unpin multiple times for the same dirent, or on a dirent // without a corresponding Pin call is a no-op. -func (w *Watch) Unpin(d *Dirent) { +func (w *Watch) Unpin(ctx context.Context, d *Dirent) { w.mu.Lock() defer w.mu.Unlock() if w.pins[d] { delete(w.pins, d) - d.DecRef() + d.DecRef(ctx) } } @@ -125,11 +126,11 @@ func (w *Watch) TargetDestroyed() { // this watch. Destroy does not cause any new events to be generated. The caller // is responsible for ensuring there are no outstanding references to this // watch. -func (w *Watch) destroy() { +func (w *Watch) destroy(ctx context.Context) { w.mu.Lock() defer w.mu.Unlock() for d := range w.pins { - d.DecRef() + d.DecRef(ctx) } w.pins = nil } diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 37bae6810..ee69b10e8 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -51,7 +51,7 @@ type MountSourceOperations interface { DirentOperations // Destroy destroys the MountSource. - Destroy() + Destroy(ctx context.Context) // Below are MountSourceOperations that do not conform to Linux. @@ -165,16 +165,16 @@ func (msrc *MountSource) DecDirentRefs() { } } -func (msrc *MountSource) destroy() { +func (msrc *MountSource) destroy(ctx context.Context) { if c := msrc.DirentRefs(); c != 0 { panic(fmt.Sprintf("MountSource with non-zero direntRefs is being destroyed: %d", c)) } - msrc.MountSourceOperations.Destroy() + msrc.MountSourceOperations.Destroy(ctx) } // DecRef drops a reference on the MountSource. -func (msrc *MountSource) DecRef() { - msrc.DecRefWithDestructor(msrc.destroy) +func (msrc *MountSource) DecRef(ctx context.Context) { + msrc.DecRefWithDestructor(ctx, msrc.destroy) } // FlushDirentRefs drops all references held by the MountSource on Dirents. @@ -264,7 +264,7 @@ func (*SimpleMountSourceOperations) ResetInodeMappings() {} func (*SimpleMountSourceOperations) SaveInodeMapping(*Inode, string) {} // Destroy implements MountSourceOperations.Destroy. -func (*SimpleMountSourceOperations) Destroy() {} +func (*SimpleMountSourceOperations) Destroy(context.Context) {} // Info defines attributes of a filesystem. type Info struct { diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index 78e35b1e6..7badc75d6 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -115,9 +115,9 @@ func (o *overlayMountSourceOperations) SaveInodeMapping(inode *Inode, path strin } // Destroy drops references on the upper and lower MountSource. -func (o *overlayMountSourceOperations) Destroy() { - o.upper.DecRef() - o.lower.DecRef() +func (o *overlayMountSourceOperations) Destroy(ctx context.Context) { + o.upper.DecRef(ctx) + o.lower.DecRef(ctx) } // type overlayFilesystem is the filesystem for overlay mounts. diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go index a3d10770b..6c296f5d0 100644 --- a/pkg/sentry/fs/mount_test.go +++ b/pkg/sentry/fs/mount_test.go @@ -18,6 +18,7 @@ import ( "fmt" "testing" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/contexttest" ) @@ -32,13 +33,13 @@ func cacheReallyContains(cache *DirentCache, d *Dirent) bool { return false } -func mountPathsAre(root *Dirent, got []*Mount, want ...string) error { +func mountPathsAre(ctx context.Context, root *Dirent, got []*Mount, want ...string) error { gotPaths := make(map[string]struct{}, len(got)) gotStr := make([]string, len(got)) for i, g := range got { if groot := g.Root(); groot != nil { name, _ := groot.FullName(root) - groot.DecRef() + groot.DecRef(ctx) gotStr[i] = name gotPaths[name] = struct{}{} } @@ -69,7 +70,7 @@ func TestMountSourceOnlyCachedOnce(t *testing.T) { t.Fatalf("NewMountNamespace failed: %v", err) } rootDirent := mm.Root() - defer rootDirent.DecRef() + defer rootDirent.DecRef(ctx) // Get a child of the root which we will mount over. Note that the // MockInodeOperations causes Walk to always succeed. @@ -125,7 +126,7 @@ func TestAllMountsUnder(t *testing.T) { t.Fatalf("NewMountNamespace failed: %v", err) } rootDirent := mm.Root() - defer rootDirent.DecRef() + defer rootDirent.DecRef(ctx) // Add mounts at the following paths: paths := []string{ @@ -150,14 +151,14 @@ func TestAllMountsUnder(t *testing.T) { if err := mm.Mount(ctx, d, submountInode); err != nil { t.Fatalf("could not mount at %q: %v", p, err) } - d.DecRef() + d.DecRef(ctx) } // mm root should contain all submounts (and does not include the root mount). rootMnt := mm.FindMount(rootDirent) submounts := mm.AllMountsUnder(rootMnt) allPaths := append(paths, "/") - if err := mountPathsAre(rootDirent, submounts, allPaths...); err != nil { + if err := mountPathsAre(ctx, rootDirent, submounts, allPaths...); err != nil { t.Error(err) } @@ -181,9 +182,9 @@ func TestAllMountsUnder(t *testing.T) { if err != nil { t.Fatalf("could not find path %q in mount manager: %v", "/foo", err) } - defer d.DecRef() + defer d.DecRef(ctx) submounts = mm.AllMountsUnder(mm.FindMount(d)) - if err := mountPathsAre(rootDirent, submounts, "/foo", "/foo/bar", "/foo/qux", "/foo/bar/baz"); err != nil { + if err := mountPathsAre(ctx, rootDirent, submounts, "/foo", "/foo/bar", "/foo/qux", "/foo/bar/baz"); err != nil { t.Error(err) } @@ -193,9 +194,9 @@ func TestAllMountsUnder(t *testing.T) { if err != nil { t.Fatalf("could not find path %q in mount manager: %v", "/waldo", err) } - defer waldo.DecRef() + defer waldo.DecRef(ctx) submounts = mm.AllMountsUnder(mm.FindMount(waldo)) - if err := mountPathsAre(rootDirent, submounts, "/waldo"); err != nil { + if err := mountPathsAre(ctx, rootDirent, submounts, "/waldo"); err != nil { t.Error(err) } } @@ -212,7 +213,7 @@ func TestUnmount(t *testing.T) { t.Fatalf("NewMountNamespace failed: %v", err) } rootDirent := mm.Root() - defer rootDirent.DecRef() + defer rootDirent.DecRef(ctx) // Add mounts at the following paths: paths := []string{ @@ -240,7 +241,7 @@ func TestUnmount(t *testing.T) { if err := mm.Mount(ctx, d, submountInode); err != nil { t.Fatalf("could not mount at %q: %v", p, err) } - d.DecRef() + d.DecRef(ctx) } allPaths := make([]string, len(paths)+1) @@ -259,13 +260,13 @@ func TestUnmount(t *testing.T) { if err := mm.Unmount(ctx, d, false); err != nil { t.Fatalf("could not unmount at %q: %v", p, err) } - d.DecRef() + d.DecRef(ctx) // Remove the path that has been unmounted and the check that the remaining // mounts are still there. allPaths = allPaths[:len(allPaths)-1] submounts := mm.AllMountsUnder(rootMnt) - if err := mountPathsAre(rootDirent, submounts, allPaths...); err != nil { + if err := mountPathsAre(ctx, rootDirent, submounts, allPaths...); err != nil { t.Error(err) } } diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 3f2bd0e87..d741c4339 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -234,7 +234,7 @@ func (mns *MountNamespace) flushMountSourceRefsLocked() { // After destroy is called, the MountNamespace may continue to be referenced (for // example via /proc/mounts), but should free all resources and shouldn't have // Find* methods called. -func (mns *MountNamespace) destroy() { +func (mns *MountNamespace) destroy(ctx context.Context) { mns.mu.Lock() defer mns.mu.Unlock() @@ -247,13 +247,13 @@ func (mns *MountNamespace) destroy() { for _, mp := range mns.mounts { // Drop the mount reference on all mounted dirents. for ; mp != nil; mp = mp.previous { - mp.root.DecRef() + mp.root.DecRef(ctx) } } mns.mounts = nil // Drop reference on the root. - mns.root.DecRef() + mns.root.DecRef(ctx) // Ensure that root cannot be accessed via this MountNamespace any // more. @@ -265,8 +265,8 @@ func (mns *MountNamespace) destroy() { } // DecRef implements RefCounter.DecRef with destructor mns.destroy. -func (mns *MountNamespace) DecRef() { - mns.DecRefWithDestructor(mns.destroy) +func (mns *MountNamespace) DecRef(ctx context.Context) { + mns.DecRefWithDestructor(ctx, mns.destroy) } // withMountLocked prevents further walks to `node`, because `node` is about to @@ -312,7 +312,7 @@ func (mns *MountNamespace) Mount(ctx context.Context, mountPoint *Dirent, inode if err != nil { return err } - defer replacement.DecRef() + defer replacement.DecRef(ctx) // Set the mount's root dirent and id. parentMnt := mns.findMountLocked(mountPoint) @@ -394,7 +394,7 @@ func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly panic(fmt.Sprintf("Last mount in the chain must be a undo mount: %+v", prev)) } // Drop mount reference taken at the end of MountNamespace.Mount. - prev.root.DecRef() + prev.root.DecRef(ctx) } else { mns.mounts[prev.root] = prev } @@ -496,11 +496,11 @@ func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path // non-directory root is hopeless. if current != root { if !IsDir(current.Inode.StableAttr) { - current.DecRef() // Drop reference from above. + current.DecRef(ctx) // Drop reference from above. return nil, syserror.ENOTDIR } if err := current.Inode.CheckPermission(ctx, PermMask{Execute: true}); err != nil { - current.DecRef() // Drop reference from above. + current.DecRef(ctx) // Drop reference from above. return nil, err } } @@ -511,12 +511,12 @@ func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path // Allow failed walks to cache the dirent, because no // children will acquire a reference at the end. current.maybeExtendReference() - current.DecRef() + current.DecRef(ctx) return nil, err } // Drop old reference. - current.DecRef() + current.DecRef(ctx) if remainder != "" { // Ensure it's resolved, unless it's the last level. @@ -570,11 +570,11 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema case nil: // Make sure we didn't exhaust the traversal budget. if *remainingTraversals == 0 { - target.DecRef() + target.DecRef(ctx) return nil, syscall.ELOOP } - node.DecRef() // Drop the original reference. + node.DecRef(ctx) // Drop the original reference. return target, nil case syscall.ENOLINK: @@ -582,7 +582,7 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema return node, nil case ErrResolveViaReadlink: - defer node.DecRef() // See above. + defer node.DecRef(ctx) // See above. // First, check if we should traverse. if *remainingTraversals == 0 { @@ -608,7 +608,7 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema return d, err default: - node.DecRef() // Drop for err; see above. + node.DecRef(ctx) // Drop for err; see above. // Propagate the error. return nil, err diff --git a/pkg/sentry/fs/mounts_test.go b/pkg/sentry/fs/mounts_test.go index a69b41468..975d6cbc9 100644 --- a/pkg/sentry/fs/mounts_test.go +++ b/pkg/sentry/fs/mounts_test.go @@ -51,7 +51,7 @@ func TestFindLink(t *testing.T) { } root := mm.Root() - defer root.DecRef() + defer root.DecRef(ctx) foo, err := root.Walk(ctx, root, "foo") if err != nil { t.Fatalf("Error walking to foo: %v", err) diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index a8ae7d81d..35013a21b 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -107,7 +107,7 @@ func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags Mount msrc := newOverlayMountSource(ctx, upper.MountSource, lower.MountSource, flags) overlay, err := newOverlayEntry(ctx, upper, lower, true) if err != nil { - msrc.DecRef() + msrc.DecRef(ctx) return nil, err } @@ -130,7 +130,7 @@ func NewOverlayRootFile(ctx context.Context, upperMS *MountSource, lower *Inode, msrc := newOverlayMountSource(ctx, upperMS, lower.MountSource, flags) overlay, err := newOverlayEntry(ctx, nil, lower, true) if err != nil { - msrc.DecRef() + msrc.DecRef(ctx) return nil, err } return newOverlayInode(ctx, overlay, msrc), nil @@ -230,16 +230,16 @@ func newOverlayEntry(ctx context.Context, upper *Inode, lower *Inode, lowerExist }, nil } -func (o *overlayEntry) release() { +func (o *overlayEntry) release(ctx context.Context) { // We drop a reference on upper and lower file system Inodes // rather than releasing them, because in-memory filesystems // may hold an extra reference to these Inodes so that they // stay in memory. if o.upper != nil { - o.upper.DecRef() + o.upper.DecRef(ctx) } if o.lower != nil { - o.lower.DecRef() + o.lower.DecRef(ctx) } } diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index 35972e23c..45523adf8 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -56,11 +56,11 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF // readDescriptors reads fds in the task starting at offset, and calls the // toDentAttr callback for each to get a DentAttr, which it then emits. This is // a helper for implementing fs.InodeOperations.Readdir. -func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int64, toDentAttr func(int) fs.DentAttr) (int64, error) { +func readDescriptors(ctx context.Context, t *kernel.Task, c *fs.DirCtx, offset int64, toDentAttr func(int) fs.DentAttr) (int64, error) { var fds []int32 t.WithMuLocked(func(t *kernel.Task) { if fdTable := t.FDTable(); fdTable != nil { - fds = fdTable.GetFDs() + fds = fdTable.GetFDs(ctx) } }) @@ -116,7 +116,7 @@ func (f *fd) GetFile(context.Context, *fs.Dirent, fs.FileFlags) (*fs.File, error func (f *fd) Readlink(ctx context.Context, _ *fs.Inode) (string, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } n, _ := f.file.Dirent.FullName(root) return n, nil @@ -135,13 +135,7 @@ func (f *fd) Truncate(context.Context, *fs.Inode, int64) error { func (f *fd) Release(ctx context.Context) { f.Symlink.Release(ctx) - f.file.DecRef() -} - -// Close releases the reference on the file. -func (f *fd) Close() error { - f.file.DecRef() - return nil + f.file.DecRef(ctx) } // fdDir is an InodeOperations for /proc/TID/fd. @@ -227,7 +221,7 @@ func (f *fdDirFile) Readdir(ctx context.Context, file *fs.File, ser fs.DentrySer if f.isInfoFile { typ = fs.Symlink } - return readDescriptors(f.t, dirCtx, file.Offset(), func(fd int) fs.DentAttr { + return readDescriptors(ctx, f.t, dirCtx, file.Offset(), func(fd int) fs.DentAttr { return fs.GenericDentAttr(typ, device.ProcDevice) }) } @@ -261,7 +255,7 @@ func (fdid *fdInfoDir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs // locks, and other data. For now we only have flags. // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt flags := file.Flags().ToLinux() | fdFlags.ToLinuxFileFlags() - file.DecRef() + file.DecRef(ctx) contents := []byte(fmt.Sprintf("flags:\t0%o\n", flags)) return newStaticProcInode(ctx, dir.MountSource, contents) }) diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index 1fc9c703c..6a63c47b3 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -47,7 +47,7 @@ func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) { // The task has been destroyed. Nothing to show here. return } - defer rootDir.DecRef() + defer rootDir.DecRef(t) mnt := t.MountNamespace().FindMount(rootDir) if mnt == nil { @@ -64,7 +64,7 @@ func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) { continue // No longer valid. } mountPath, desc := mroot.FullName(rootDir) - mroot.DecRef() + mroot.DecRef(t) if !desc { // MountSources that are not descendants of the chroot jail are ignored. continue @@ -97,7 +97,7 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se if mroot == nil { return // No longer valid. } - defer mroot.DecRef() + defer mroot.DecRef(ctx) // Format: // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue @@ -216,7 +216,7 @@ func (mf *mountsFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHan if root == nil { return // No longer valid. } - defer root.DecRef() + defer root.DecRef(ctx) flags := root.Inode.MountSource.Flags opts := "rw" diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index bd18177d4..83a43aa26 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -419,7 +419,7 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s } sfile := s.(*fs.File) if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX { - s.DecRef() + s.DecRef(ctx) // Not a unix socket. continue } @@ -479,7 +479,7 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s } fmt.Fprintf(&buf, "\n") - s.DecRef() + s.DecRef(ctx) } data := []seqfile.SeqData{ @@ -574,7 +574,7 @@ func commonReadSeqFileDataTCP(ctx context.Context, n seqfile.SeqHandle, k *kerne panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) } if family, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) { - s.DecRef() + s.DecRef(ctx) // Not tcp4 sockets. continue } @@ -664,7 +664,7 @@ func commonReadSeqFileDataTCP(ctx context.Context, n seqfile.SeqHandle, k *kerne fmt.Fprintf(&buf, "\n") - s.DecRef() + s.DecRef(ctx) } data := []seqfile.SeqData{ @@ -752,7 +752,7 @@ func (n *netUDP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) } if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM { - s.DecRef() + s.DecRef(ctx) // Not udp4 socket. continue } @@ -822,7 +822,7 @@ func (n *netUDP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se fmt.Fprintf(&buf, "\n") - s.DecRef() + s.DecRef(ctx) } data := []seqfile.SeqData{ diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index c659224a7..77e0e1d26 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -213,7 +213,7 @@ func (rpf *rootProcFile) Readdir(ctx context.Context, file *fs.File, ser fs.Dent // Add dot and dotdot. root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dot, dotdot := file.Dirent.GetDotAttrs(root) names = append(names, ".", "..") diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index 702fdd392..8615b60f0 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -272,6 +272,96 @@ func (f *tcpSackFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSeque return n, f.tcpSack.stack.SetTCPSACKEnabled(*f.tcpSack.enabled) } +// +stateify savable +type tcpRecovery struct { + fsutil.SimpleFileInode + + stack inet.Stack `state:"wait"` + recovery inet.TCPLossRecovery +} + +func newTCPRecoveryInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { + ts := &tcpRecovery{ + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC), + stack: s, + } + sattr := fs.StableAttr{ + DeviceID: device.ProcDevice.DeviceID(), + InodeID: device.ProcDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.SpecialFile, + } + return fs.NewInode(ctx, ts, msrc, sattr) +} + +// Truncate implements fs.InodeOperations.Truncate. +func (*tcpRecovery) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + +// GetFile implements fs.InodeOperations.GetFile. +func (r *tcpRecovery) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + flags.Pread = true + flags.Pwrite = true + return fs.NewFile(ctx, dirent, flags, &tcpRecoveryFile{ + tcpRecovery: r, + stack: r.stack, + }), nil +} + +// +stateify savable +type tcpRecoveryFile struct { + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + + tcpRecovery *tcpRecovery + + stack inet.Stack `state:"wait"` +} + +// Read implements fs.FileOperations.Read. +func (f *tcpRecoveryFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + return 0, io.EOF + } + + recovery, err := f.stack.TCPRecovery() + if err != nil { + return 0, err + } + f.tcpRecovery.recovery = recovery + s := fmt.Sprintf("%d\n", f.tcpRecovery.recovery) + n, err := dst.CopyOut(ctx, []byte(s)) + return int64(n), err +} + +// Write implements fs.FileOperations.Write. +func (f *tcpRecoveryFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + if src.NumBytes() == 0 { + return 0, nil + } + src = src.TakeFirst(usermem.PageSize - 1) + + var v int32 + n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) + if err != nil { + return 0, err + } + f.tcpRecovery.recovery = inet.TCPLossRecovery(v) + if err := f.tcpRecovery.stack.SetTCPRecovery(f.tcpRecovery.recovery); err != nil { + return 0, err + } + return n, nil +} + func (p *proc) newSysNetCore(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { // The following files are simple stubs until they are implemented in // netstack, most of these files are configuration related. We use the @@ -351,6 +441,11 @@ func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s ine contents["tcp_wmem"] = newTCPMemInode(ctx, msrc, s, tcpWMem) } + // Add tcp_recovery. + if _, err := s.TCPRecovery(); err == nil { + contents["tcp_recovery"] = newTCPRecoveryInode(ctx, msrc, s) + } + d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil) } diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 4bbe90198..9cf7f2a62 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -185,7 +185,7 @@ func (f *subtasksFile) Readdir(ctx context.Context, file *fs.File, ser fs.Dentry // Serialize "." and "..". root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dot, dotdot := file.Dirent.GetDotAttrs(root) if err := dirCtx.DirEmit(".", dot); err != nil { @@ -295,7 +295,7 @@ func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { if err != nil { return "", err } - defer exec.DecRef() + defer exec.DecRef(ctx) return exec.PathnameWithDeleted(ctx), nil } diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index bfa304552..f4fcddecb 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -219,7 +219,7 @@ func (d *Dir) Remove(ctx context.Context, _ *fs.Inode, name string) error { } // Remove our reference on the inode. - inode.DecRef() + inode.DecRef(ctx) return nil } @@ -250,7 +250,7 @@ func (d *Dir) RemoveDirectory(ctx context.Context, _ *fs.Inode, name string) err } // Remove our reference on the inode. - inode.DecRef() + inode.DecRef(ctx) return nil } @@ -326,7 +326,7 @@ func (d *Dir) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.F // Create the Dirent and corresponding file. created := fs.NewDirent(ctx, inode, name) - defer created.DecRef() + defer created.DecRef(ctx) return created.Inode.GetFile(ctx, created, flags) } @@ -412,11 +412,11 @@ func (*Dir) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, ol } // Release implements fs.InodeOperation.Release. -func (d *Dir) Release(_ context.Context) { +func (d *Dir) Release(ctx context.Context) { // Drop references on all children. d.mu.Lock() for _, i := range d.children { - i.DecRef() + i.DecRef(ctx) } d.mu.Unlock() } @@ -456,7 +456,7 @@ func (dfo *dirFileOperations) IterateDir(ctx context.Context, d *fs.Dirent, dirC func (dfo *dirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &fs.DirCtx{ Serializer: serializer, @@ -473,13 +473,13 @@ func hasChildren(ctx context.Context, inode *fs.Inode) (bool, error) { // dropped when that dirent is destroyed. inode.IncRef() d := fs.NewTransientDirent(inode) - defer d.DecRef() + defer d.DecRef(ctx) file, err := inode.GetFile(ctx, d, fs.FileFlags{Read: true}) if err != nil { return false, err } - defer file.DecRef() + defer file.DecRef(ctx) ser := &fs.CollectEntriesSerializer{} if err := file.Readdir(ctx, ser); err != nil { @@ -530,7 +530,7 @@ func Rename(ctx context.Context, oldParent fs.InodeOperations, oldName string, n if err != nil { return err } - inode.DecRef() + inode.DecRef(ctx) } // Be careful, we may have already grabbed this mutex above. diff --git a/pkg/sentry/fs/ramfs/tree_test.go b/pkg/sentry/fs/ramfs/tree_test.go index a6ed8b2c5..3e0d1e07e 100644 --- a/pkg/sentry/fs/ramfs/tree_test.go +++ b/pkg/sentry/fs/ramfs/tree_test.go @@ -67,7 +67,7 @@ func TestMakeDirectoryTree(t *testing.T) { continue } root := mm.Root() - defer mm.DecRef() + defer mm.DecRef(ctx) for _, p := range test.subdirs { maxTraversals := uint(0) diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go index 88c344089..f362ca9b6 100644 --- a/pkg/sentry/fs/timerfd/timerfd.go +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -55,7 +55,7 @@ type TimerOperations struct { func NewFile(ctx context.Context, c ktime.Clock) *fs.File { dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[timerfd]") // Release the initial dirent reference after NewFile takes a reference. - defer dirent.DecRef() + defer dirent.DecRef(ctx) tops := &TimerOperations{} tops.timer = ktime.NewTimer(c, tops) // Timerfds reject writes, but the Write flag must be set in order to @@ -65,7 +65,7 @@ func NewFile(ctx context.Context, c ktime.Clock) *fs.File { } // Release implements fs.FileOperations.Release. -func (t *TimerOperations) Release() { +func (t *TimerOperations) Release(context.Context) { t.timer.Destroy() } diff --git a/pkg/sentry/fs/tmpfs/file_test.go b/pkg/sentry/fs/tmpfs/file_test.go index aaba35502..d4d613ea9 100644 --- a/pkg/sentry/fs/tmpfs/file_test.go +++ b/pkg/sentry/fs/tmpfs/file_test.go @@ -46,7 +46,7 @@ func newFile(ctx context.Context) *fs.File { func TestGrow(t *testing.T) { ctx := contexttest.Context(t) f := newFile(ctx) - defer f.DecRef() + defer f.DecRef(ctx) abuf := bytes.Repeat([]byte{'a'}, 68) n, err := f.Pwritev(ctx, usermem.BytesIOSequence(abuf), 0) diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 108654827..463f6189e 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -132,7 +132,7 @@ func (d *dirInodeOperations) Release(ctx context.Context) { d.mu.Lock() defer d.mu.Unlock() - d.master.DecRef() + d.master.DecRef(ctx) if len(d.slaves) != 0 { panic(fmt.Sprintf("devpts directory still contains active terminals: %+v", d)) } @@ -263,7 +263,7 @@ func (d *dirInodeOperations) allocateTerminal(ctx context.Context) (*Terminal, e } // masterClose is called when the master end of t is closed. -func (d *dirInodeOperations) masterClose(t *Terminal) { +func (d *dirInodeOperations) masterClose(ctx context.Context, t *Terminal) { d.mu.Lock() defer d.mu.Unlock() @@ -277,7 +277,7 @@ func (d *dirInodeOperations) masterClose(t *Terminal) { panic(fmt.Sprintf("Terminal %+v doesn't exist in %+v?", t, d)) } - s.DecRef() + s.DecRef(ctx) delete(d.slaves, t.n) d.dentryMap.Remove(strconv.FormatUint(uint64(t.n), 10)) } @@ -322,7 +322,7 @@ func (df *dirFileOperations) IterateDir(ctx context.Context, d *fs.Dirent, dirCt func (df *dirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &fs.DirCtx{ Serializer: serializer, diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index 8fe05ebe5..2d4d44bf3 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -108,4 +108,4 @@ func (superOperations) ResetInodeMappings() {} func (superOperations) SaveInodeMapping(*fs.Inode, string) {} // Destroy implements MountSourceOperations.Destroy. -func (superOperations) Destroy() {} +func (superOperations) Destroy(context.Context) {} diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index fe07fa929..e00746017 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -75,7 +75,7 @@ func newMasterInode(ctx context.Context, d *dirInodeOperations, owner fs.FileOwn } // Release implements fs.InodeOperations.Release. -func (mi *masterInodeOperations) Release(ctx context.Context) { +func (mi *masterInodeOperations) Release(context.Context) { } // Truncate implements fs.InodeOperations.Truncate. @@ -120,9 +120,9 @@ type masterFileOperations struct { var _ fs.FileOperations = (*masterFileOperations)(nil) // Release implements fs.FileOperations.Release. -func (mf *masterFileOperations) Release() { - mf.d.masterClose(mf.t) - mf.t.DecRef() +func (mf *masterFileOperations) Release(ctx context.Context) { + mf.d.masterClose(ctx, mf.t) + mf.t.DecRef(ctx) } // EventRegister implements waiter.Waitable.EventRegister. diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index 9871f6fc6..7c7292687 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -71,7 +71,7 @@ func newSlaveInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owne // Release implements fs.InodeOperations.Release. func (si *slaveInodeOperations) Release(ctx context.Context) { - si.t.DecRef() + si.t.DecRef(ctx) } // Truncate implements fs.InodeOperations.Truncate. @@ -106,7 +106,7 @@ type slaveFileOperations struct { var _ fs.FileOperations = (*slaveFileOperations)(nil) // Release implements fs.FileOperations.Release. -func (sf *slaveFileOperations) Release() { +func (sf *slaveFileOperations) Release(context.Context) { } // EventRegister implements waiter.Waitable.EventRegister. diff --git a/pkg/sentry/fs/user/path.go b/pkg/sentry/fs/user/path.go index 397e96045..2f5a43b84 100644 --- a/pkg/sentry/fs/user/path.go +++ b/pkg/sentry/fs/user/path.go @@ -82,7 +82,7 @@ func resolve(ctx context.Context, mns *fs.MountNamespace, paths []string, name s // Caller has no root. Don't bother traversing anything. return "", syserror.ENOENT } - defer root.DecRef() + defer root.DecRef(ctx) for _, p := range paths { if !path.IsAbs(p) { // Relative paths aren't safe, no one should be using them. @@ -100,7 +100,7 @@ func resolve(ctx context.Context, mns *fs.MountNamespace, paths []string, name s if err != nil { return "", err } - defer d.DecRef() + defer d.DecRef(ctx) // Check that it is a regular file. if !fs.IsRegular(d.Inode.StableAttr) { @@ -121,7 +121,7 @@ func resolve(ctx context.Context, mns *fs.MountNamespace, paths []string, name s func resolveVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, paths []string, name string) (string, error) { root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) for _, p := range paths { if !path.IsAbs(p) { // Relative paths aren't safe, no one should be using them. @@ -148,7 +148,7 @@ func resolveVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNam if err != nil { return "", err } - dentry.DecRef() + dentry.DecRef(ctx) return binPath, nil } diff --git a/pkg/sentry/fs/user/user.go b/pkg/sentry/fs/user/user.go index f4d525523..936fd3932 100644 --- a/pkg/sentry/fs/user/user.go +++ b/pkg/sentry/fs/user/user.go @@ -62,7 +62,7 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.K // doesn't exist we will return the default home directory. return defaultHome, nil } - defer dirent.DecRef() + defer dirent.DecRef(ctx) // Check read permissions on the file. if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Read: true}); err != nil { @@ -81,7 +81,7 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.K if err != nil { return "", err } - defer f.DecRef() + defer f.DecRef(ctx) r := &fileReader{ Ctx: ctx, @@ -105,7 +105,7 @@ func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth. const defaultHome = "/" root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) creds := auth.CredentialsFromContext(ctx) @@ -123,7 +123,7 @@ func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth. if err != nil { return defaultHome, nil } - defer fd.DecRef() + defer fd.DecRef(ctx) r := &fileReaderVFS2{ ctx: ctx, diff --git a/pkg/sentry/fs/user/user_test.go b/pkg/sentry/fs/user/user_test.go index 7d8e9ac7c..12b786224 100644 --- a/pkg/sentry/fs/user/user_test.go +++ b/pkg/sentry/fs/user/user_test.go @@ -39,7 +39,7 @@ func createEtcPasswd(ctx context.Context, root *fs.Dirent, contents string, mode if err != nil { return err } - defer etc.DecRef() + defer etc.DecRef(ctx) switch mode.FileType() { case 0: // Don't create anything. @@ -49,7 +49,7 @@ func createEtcPasswd(ctx context.Context, root *fs.Dirent, contents string, mode if err != nil { return err } - defer passwd.DecRef() + defer passwd.DecRef(ctx) if _, err := passwd.Writev(ctx, usermem.BytesIOSequence([]byte(contents))); err != nil { return err } @@ -110,9 +110,9 @@ func TestGetExecUserHome(t *testing.T) { if err != nil { t.Fatalf("NewMountNamespace failed: %v", err) } - defer mns.DecRef() + defer mns.DecRef(ctx) root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) ctx = fs.WithRoot(ctx, root) if err := createEtcPasswd(ctx, root, tc.passwdContents, tc.passwdMode); err != nil { diff --git a/pkg/sentry/fsbridge/bridge.go b/pkg/sentry/fsbridge/bridge.go index 8e7590721..7e61209ee 100644 --- a/pkg/sentry/fsbridge/bridge.go +++ b/pkg/sentry/fsbridge/bridge.go @@ -44,7 +44,7 @@ type File interface { IncRef() // DecRef decrements reference. - DecRef() + DecRef(ctx context.Context) } // Lookup provides a common interface to open files. diff --git a/pkg/sentry/fsbridge/fs.go b/pkg/sentry/fsbridge/fs.go index 093ce1fb3..9785fd62a 100644 --- a/pkg/sentry/fsbridge/fs.go +++ b/pkg/sentry/fsbridge/fs.go @@ -49,7 +49,7 @@ func (f *fsFile) PathnameWithDeleted(ctx context.Context) string { // global there. return "" } - defer root.DecRef() + defer root.DecRef(ctx) name, _ := f.file.Dirent.FullName(root) return name @@ -87,8 +87,8 @@ func (f *fsFile) IncRef() { } // DecRef implements File. -func (f *fsFile) DecRef() { - f.file.DecRef() +func (f *fsFile) DecRef(ctx context.Context) { + f.file.DecRef(ctx) } // fsLookup implements Lookup interface using fs.File. @@ -124,7 +124,7 @@ func (l *fsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptio if err != nil { return nil, err } - defer d.DecRef() + defer d.DecRef(ctx) if !resolveFinal && fs.IsSymlink(d.Inode.StableAttr) { return nil, syserror.ELOOP diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go index 89168220a..323506d33 100644 --- a/pkg/sentry/fsbridge/vfs.go +++ b/pkg/sentry/fsbridge/vfs.go @@ -43,7 +43,7 @@ func NewVFSFile(file *vfs.FileDescription) File { // PathnameWithDeleted implements File. func (f *VFSFile) PathnameWithDeleted(ctx context.Context) string { root := vfs.RootFromContext(ctx) - defer root.DecRef() + defer root.DecRef(ctx) vfsObj := f.file.VirtualDentry().Mount().Filesystem().VirtualFilesystem() name, _ := vfsObj.PathnameWithDeleted(ctx, root, f.file.VirtualDentry()) @@ -86,8 +86,8 @@ func (f *VFSFile) IncRef() { } // DecRef implements File. -func (f *VFSFile) DecRef() { - f.file.DecRef() +func (f *VFSFile) DecRef(ctx context.Context) { + f.file.DecRef(ctx) } // FileDescription returns the FileDescription represented by f. It does not diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index e6fda2b4f..7169e91af 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -103,9 +103,9 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } // rootInode is the root directory inode for the devpts mounts. diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go index 1081fff52..3bb397f71 100644 --- a/pkg/sentry/fsimpl/devpts/master.go +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -60,7 +60,7 @@ func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vf } fd.LockFD.Init(&mi.locks) if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { - mi.DecRef() + mi.DecRef(ctx) return nil, err } return &fd.vfsfd, nil @@ -98,9 +98,9 @@ type masterFileDescription struct { var _ vfs.FileDescriptionImpl = (*masterFileDescription)(nil) // Release implements vfs.FileDescriptionImpl.Release. -func (mfd *masterFileDescription) Release() { +func (mfd *masterFileDescription) Release(ctx context.Context) { mfd.inode.root.masterClose(mfd.t) - mfd.inode.DecRef() + mfd.inode.DecRef(ctx) } // EventRegister implements waiter.Waitable.EventRegister. diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go index a91cae3ef..32e4e1908 100644 --- a/pkg/sentry/fsimpl/devpts/slave.go +++ b/pkg/sentry/fsimpl/devpts/slave.go @@ -56,7 +56,7 @@ func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs } fd.LockFD.Init(&si.locks) if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { - si.DecRef() + si.DecRef(ctx) return nil, err } return &fd.vfsfd, nil @@ -103,8 +103,8 @@ type slaveFileDescription struct { var _ vfs.FileDescriptionImpl = (*slaveFileDescription)(nil) // Release implements fs.FileOperations.Release. -func (sfd *slaveFileDescription) Release() { - sfd.inode.DecRef() +func (sfd *slaveFileDescription) Release(ctx context.Context) { + sfd.inode.DecRef(ctx) } // EventRegister implements waiter.Waitable.EventRegister. diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go index d0e06cdc0..2ed5fa8a9 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go @@ -92,9 +92,9 @@ func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth } // Release must be called when a is no longer in use. -func (a *Accessor) Release() { - a.root.DecRef() - a.mntns.DecRef() +func (a *Accessor) Release(ctx context.Context) { + a.root.DecRef(ctx) + a.mntns.DecRef(ctx) } // accessorContext implements context.Context by extending an existing diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go index b6d52c015..747867cca 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go @@ -30,7 +30,7 @@ func TestDevtmpfs(t *testing.T) { creds := auth.CredentialsFromContext(ctx) vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } // Register tmpfs just so that we can have a root filesystem that isn't @@ -48,9 +48,9 @@ func TestDevtmpfs(t *testing.T) { if err != nil { t.Fatalf("failed to create tmpfs root mount: %v", err) } - defer mntns.DecRef() + defer mntns.DecRef(ctx) root := mntns.Root() - defer root.DecRef() + defer root.DecRef(ctx) devpop := vfs.PathOperation{ Root: root, Start: root, @@ -69,7 +69,7 @@ func TestDevtmpfs(t *testing.T) { if err != nil { t.Fatalf("failed to create devtmpfs.Accessor: %v", err) } - defer a.Release() + defer a.Release(ctx) // Create "userspace-initialized" files using a devtmpfs.Accessor. if err := a.UserspaceInit(ctx); err != nil { diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go index d12d78b84..812171fa3 100644 --- a/pkg/sentry/fsimpl/eventfd/eventfd.go +++ b/pkg/sentry/fsimpl/eventfd/eventfd.go @@ -59,9 +59,9 @@ type EventFileDescription struct { var _ vfs.FileDescriptionImpl = (*EventFileDescription)(nil) // New creates a new event fd. -func New(vfsObj *vfs.VirtualFilesystem, initVal uint64, semMode bool, flags uint32) (*vfs.FileDescription, error) { +func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, initVal uint64, semMode bool, flags uint32) (*vfs.FileDescription, error) { vd := vfsObj.NewAnonVirtualDentry("[eventfd]") - defer vd.DecRef() + defer vd.DecRef(ctx) efd := &EventFileDescription{ val: initVal, semMode: semMode, @@ -107,7 +107,7 @@ func (efd *EventFileDescription) HostFD() (int, error) { } // Release implements FileDescriptionImpl.Release() -func (efd *EventFileDescription) Release() { +func (efd *EventFileDescription) Release(context.Context) { efd.mu.Lock() defer efd.mu.Unlock() if efd.hostfd >= 0 { diff --git a/pkg/sentry/fsimpl/eventfd/eventfd_test.go b/pkg/sentry/fsimpl/eventfd/eventfd_test.go index 20e3adffc..49916fa81 100644 --- a/pkg/sentry/fsimpl/eventfd/eventfd_test.go +++ b/pkg/sentry/fsimpl/eventfd/eventfd_test.go @@ -36,16 +36,16 @@ func TestEventFD(t *testing.T) { for _, initVal := range initVals { ctx := contexttest.Context(t) vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } // Make a new eventfd that is writable. - eventfd, err := New(vfsObj, initVal, false, linux.O_RDWR) + eventfd, err := New(ctx, vfsObj, initVal, false, linux.O_RDWR) if err != nil { t.Fatalf("New() failed: %v", err) } - defer eventfd.DecRef() + defer eventfd.DecRef(ctx) // Register a callback for a write event. w, ch := waiter.NewChannelEntry(nil) @@ -74,16 +74,16 @@ func TestEventFD(t *testing.T) { func TestEventFDStat(t *testing.T) { ctx := contexttest.Context(t) vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } // Make a new eventfd that is writable. - eventfd, err := New(vfsObj, 0, false, linux.O_RDWR) + eventfd, err := New(ctx, vfsObj, 0, false, linux.O_RDWR) if err != nil { t.Fatalf("New() failed: %v", err) } - defer eventfd.DecRef() + defer eventfd.DecRef(ctx) statx, err := eventfd.Stat(ctx, vfs.StatOptions{ Mask: linux.STATX_BASIC_STATS, diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go index 89caee3df..8f7d5a9bb 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go +++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go @@ -53,7 +53,7 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys // Create VFS. vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { return nil, nil, nil, nil, err } vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ @@ -68,7 +68,7 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys root := mntns.Root() tearDown := func() { - root.DecRef() + root.DecRef(ctx) if err := f.Close(); err != nil { b.Fatalf("tearDown failed: %v", err) @@ -169,7 +169,7 @@ func BenchmarkVFS2ExtfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to mount point: %v", err) } - defer mountPoint.DecRef() + defer mountPoint.DecRef(ctx) // Create extfs submount. mountTearDown := mount(b, fmt.Sprintf("/tmp/image-%d.ext4", depth), vfsfs, &pop) diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go index 55902322a..7a1b4219f 100644 --- a/pkg/sentry/fsimpl/ext/dentry.go +++ b/pkg/sentry/fsimpl/ext/dentry.go @@ -15,6 +15,7 @@ package ext import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/vfs" ) @@ -55,7 +56,7 @@ func (d *dentry) TryIncRef() bool { } // DecRef implements vfs.DentryImpl.DecRef. -func (d *dentry) DecRef() { +func (d *dentry) DecRef(ctx context.Context) { // FIXME(b/134676337): filesystem.mu may not be locked as required by // inode.decRef(). d.inode.decRef() @@ -64,7 +65,7 @@ func (d *dentry) DecRef() { // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. // // TODO(b/134676337): Implement inotify. -func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) {} +func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {} // Watches implements vfs.DentryImpl.Watches. // @@ -76,4 +77,4 @@ func (d *dentry) Watches() *vfs.Watches { // OnZeroWatches implements vfs.Dentry.OnZeroWatches. // // TODO(b/134676337): Implement inotify. -func (d *dentry) OnZeroWatches() {} +func (d *dentry) OnZeroWatches(context.Context) {} diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go index 357512c7e..0fc01668d 100644 --- a/pkg/sentry/fsimpl/ext/directory.go +++ b/pkg/sentry/fsimpl/ext/directory.go @@ -142,7 +142,7 @@ type directoryFD struct { var _ vfs.FileDescriptionImpl = (*directoryFD)(nil) // Release implements vfs.FileDescriptionImpl.Release. -func (fd *directoryFD) Release() { +func (fd *directoryFD) Release(ctx context.Context) { if fd.iter == nil { return } diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go index dac6effbf..08ffc2834 100644 --- a/pkg/sentry/fsimpl/ext/ext.go +++ b/pkg/sentry/fsimpl/ext/ext.go @@ -123,32 +123,32 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fs.vfsfs.Init(vfsObj, &fsType, &fs) fs.sb, err = readSuperBlock(dev) if err != nil { - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, err } if fs.sb.Magic() != linux.EXT_SUPER_MAGIC { // mount(2) specifies that EINVAL should be returned if the superblock is // invalid. - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, syserror.EINVAL } // Refuse to mount if the filesystem is incompatible. if !isCompatible(fs.sb) { - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, syserror.EINVAL } fs.bgs, err = readBlockGroups(dev, fs.sb) if err != nil { - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, err } rootInode, err := fs.getOrCreateInodeLocked(disklayout.RootDirInode) if err != nil { - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, err } rootInode.incRef() diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go index 64e9a579f..2dbaee287 100644 --- a/pkg/sentry/fsimpl/ext/ext_test.go +++ b/pkg/sentry/fsimpl/ext/ext_test.go @@ -65,7 +65,7 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys // Create VFS. vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ @@ -80,7 +80,7 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys root := mntns.Root() tearDown := func() { - root.DecRef() + root.DecRef(ctx) if err := f.Close(); err != nil { t.Fatalf("tearDown failed: %v", err) diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 557963e03..c714ddf73 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -84,7 +84,7 @@ var _ vfs.FilesystemImpl = (*filesystem)(nil) // - filesystem.mu must be locked (for writing if write param is true). // - !rp.Done(). // - inode == vfsd.Impl().(*Dentry).inode. -func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) { +func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) { if !inode.isDir() { return nil, nil, syserror.ENOTDIR } @@ -100,7 +100,7 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo } d := vfsd.Impl().(*dentry) if name == ".." { - isRoot, err := rp.CheckRoot(vfsd) + isRoot, err := rp.CheckRoot(ctx, vfsd) if err != nil { return nil, nil, err } @@ -108,7 +108,7 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo rp.Advance() return vfsd, inode, nil } - if err := rp.CheckMount(&d.parent.vfsd); err != nil { + if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { return nil, nil, err } rp.Advance() @@ -143,7 +143,7 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo child.name = name dir.childCache[name] = child } - if err := rp.CheckMount(&child.vfsd); err != nil { + if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, nil, err } if child.inode.isSymlink() && rp.ShouldFollowSymlink() { @@ -167,12 +167,12 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo // // Preconditions: // - filesystem.mu must be locked (for writing if write param is true). -func walkLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { +func walkLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { vfsd := rp.Start() inode := vfsd.Impl().(*dentry).inode for !rp.Done() { var err error - vfsd, inode, err = stepLocked(rp, vfsd, inode, write) + vfsd, inode, err = stepLocked(ctx, rp, vfsd, inode, write) if err != nil { return nil, nil, err } @@ -196,12 +196,12 @@ func walkLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) // Preconditions: // - filesystem.mu must be locked (for writing if write param is true). // - !rp.Done(). -func walkParentLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { +func walkParentLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { vfsd := rp.Start() inode := vfsd.Impl().(*dentry).inode for !rp.Final() { var err error - vfsd, inode, err = stepLocked(rp, vfsd, inode, write) + vfsd, inode, err = stepLocked(ctx, rp, vfsd, inode, write) if err != nil { return nil, nil, err } @@ -216,7 +216,7 @@ func walkParentLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, e // the rp till the parent of the last component which should be an existing // directory. If parent is false then resolves rp entirely. Attemps to resolve // the path as far as it can with a read lock and upgrades the lock if needed. -func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) { +func (fs *filesystem) walk(ctx context.Context, rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) { var ( vfsd *vfs.Dentry inode *inode @@ -227,9 +227,9 @@ func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *in // of disk. This reduces congestion (allows concurrent walks). fs.mu.RLock() if parent { - vfsd, inode, err = walkParentLocked(rp, false) + vfsd, inode, err = walkParentLocked(ctx, rp, false) } else { - vfsd, inode, err = walkLocked(rp, false) + vfsd, inode, err = walkLocked(ctx, rp, false) } fs.mu.RUnlock() @@ -238,9 +238,9 @@ func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *in // walk is fine as this is a read only filesystem. fs.mu.Lock() if parent { - vfsd, inode, err = walkParentLocked(rp, true) + vfsd, inode, err = walkParentLocked(ctx, rp, true) } else { - vfsd, inode, err = walkLocked(rp, true) + vfsd, inode, err = walkLocked(ctx, rp, true) } fs.mu.Unlock() } @@ -283,7 +283,7 @@ func (fs *filesystem) statTo(stat *linux.Statfs) { // AccessAt implements vfs.Filesystem.Impl.AccessAt. func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return err } @@ -292,7 +292,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { - vfsd, inode, err := fs.walk(rp, false) + vfsd, inode, err := fs.walk(ctx, rp, false) if err != nil { return nil, err } @@ -312,7 +312,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { - vfsd, inode, err := fs.walk(rp, true) + vfsd, inode, err := fs.walk(ctx, rp, true) if err != nil { return nil, err } @@ -322,7 +322,7 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa // OpenAt implements vfs.FilesystemImpl.OpenAt. func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - vfsd, inode, err := fs.walk(rp, false) + vfsd, inode, err := fs.walk(ctx, rp, false) if err != nil { return nil, err } @@ -336,7 +336,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return "", err } @@ -349,7 +349,7 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st // StatAt implements vfs.FilesystemImpl.StatAt. func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return linux.Statx{}, err } @@ -360,7 +360,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // StatFSAt implements vfs.FilesystemImpl.StatFSAt. func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { - if _, _, err := fs.walk(rp, false); err != nil { + if _, _, err := fs.walk(ctx, rp, false); err != nil { return linux.Statfs{}, err } @@ -370,7 +370,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) } @@ -390,7 +390,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EEXIST } - if _, _, err := fs.walk(rp, true); err != nil { + if _, _, err := fs.walk(ctx, rp, true); err != nil { return err } @@ -403,7 +403,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return syserror.EEXIST } - if _, _, err := fs.walk(rp, true); err != nil { + if _, _, err := fs.walk(ctx, rp, true); err != nil { return err } @@ -416,7 +416,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return syserror.EEXIST } - _, _, err := fs.walk(rp, true) + _, _, err := fs.walk(ctx, rp, true) if err != nil { return err } @@ -430,7 +430,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return syserror.ENOENT } - _, _, err := fs.walk(rp, false) + _, _, err := fs.walk(ctx, rp, false) if err != nil { return err } @@ -440,7 +440,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // RmdirAt implements vfs.FilesystemImpl.RmdirAt. func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return err } @@ -454,7 +454,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { - _, _, err := fs.walk(rp, false) + _, _, err := fs.walk(ctx, rp, false) if err != nil { return err } @@ -468,7 +468,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ return syserror.EEXIST } - _, _, err := fs.walk(rp, true) + _, _, err := fs.walk(ctx, rp, true) if err != nil { return err } @@ -478,7 +478,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return err } @@ -492,7 +492,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return nil, err } @@ -506,7 +506,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { - _, _, err := fs.walk(rp, false) + _, _, err := fs.walk(ctx, rp, false) if err != nil { return nil, err } @@ -515,7 +515,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { - _, _, err := fs.walk(rp, false) + _, _, err := fs.walk(ctx, rp, false) if err != nil { return "", err } @@ -524,7 +524,7 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { - _, _, err := fs.walk(rp, false) + _, _, err := fs.walk(ctx, rp, false) if err != nil { return err } @@ -533,7 +533,7 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { - _, _, err := fs.walk(rp, false) + _, _, err := fs.walk(ctx, rp, false) if err != nil { return err } diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go index 66d14bb95..e73e740d6 100644 --- a/pkg/sentry/fsimpl/ext/regular_file.go +++ b/pkg/sentry/fsimpl/ext/regular_file.go @@ -79,7 +79,7 @@ type regularFileFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *regularFileFD) Release() {} +func (fd *regularFileFD) Release(context.Context) {} // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go index 62efd4095..2fd0d1fa8 100644 --- a/pkg/sentry/fsimpl/ext/symlink.go +++ b/pkg/sentry/fsimpl/ext/symlink.go @@ -73,7 +73,7 @@ type symlinkFD struct { var _ vfs.FileDescriptionImpl = (*symlinkFD)(nil) // Release implements vfs.FileDescriptionImpl.Release. -func (fd *symlinkFD) Release() {} +func (fd *symlinkFD) Release(context.Context) {} // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *symlinkFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD index 67649e811..999111deb 100644 --- a/pkg/sentry/fsimpl/fuse/BUILD +++ b/pkg/sentry/fsimpl/fuse/BUILD @@ -21,6 +21,7 @@ go_library( "connection.go", "dev.go", "fusefs.go", + "init.go", "register.go", "request_list.go", ], @@ -44,14 +45,13 @@ go_library( ) go_test( - name = "dev_test", + name = "fuse_test", size = "small", srcs = ["dev_test.go"], library = ":fuse", deps = [ "//pkg/abi/linux", "//pkg/sentry/fsimpl/testutil", - "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go index f330da0bd..6df2728ab 100644 --- a/pkg/sentry/fsimpl/fuse/connection.go +++ b/pkg/sentry/fsimpl/fuse/connection.go @@ -17,6 +17,8 @@ package fuse import ( "errors" "fmt" + "sync" + "sync/atomic" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -25,18 +27,29 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" "gvisor.dev/gvisor/tools/go_marshal/marshal" ) -// MaxActiveRequestsDefault is the default setting controlling the upper bound +// maxActiveRequestsDefault is the default setting controlling the upper bound // on the number of active requests at any given time. -const MaxActiveRequestsDefault = 10000 +const maxActiveRequestsDefault = 10000 -var ( - // Ordinary requests have even IDs, while interrupts IDs are odd. - InitReqBit uint64 = 1 - ReqIDStep uint64 = 2 +// Ordinary requests have even IDs, while interrupts IDs are odd. +// Used to increment the unique ID for each FUSE request. +var reqIDStep uint64 = 2 + +const ( + // fuseDefaultMaxBackground is the default value for MaxBackground. + fuseDefaultMaxBackground = 12 + + // fuseDefaultCongestionThreshold is the default value for CongestionThreshold, + // and is 75% of the default maximum of MaxGround. + fuseDefaultCongestionThreshold = (fuseDefaultMaxBackground * 3 / 4) + + // fuseDefaultMaxPagesPerReq is the default value for MaxPagesPerReq. + fuseDefaultMaxPagesPerReq = 32 ) // Request represents a FUSE operation request that hasn't been sent to the @@ -61,17 +74,125 @@ type Response struct { data []byte } -// Connection is the struct by which the sentry communicates with the FUSE server daemon. -type Connection struct { +// connection is the struct by which the sentry communicates with the FUSE server daemon. +type connection struct { fd *DeviceFD - // MaxWrite is the daemon's maximum size of a write buffer. - // This is negotiated during FUSE_INIT. - MaxWrite uint32 + // The following FUSE_INIT flags are currently unsupported by this implementation: + // - FUSE_ATOMIC_O_TRUNC: requires open(..., O_TRUNC) + // - FUSE_EXPORT_SUPPORT + // - FUSE_HANDLE_KILLPRIV + // - FUSE_POSIX_LOCKS: requires POSIX locks + // - FUSE_FLOCK_LOCKS: requires POSIX locks + // - FUSE_AUTO_INVAL_DATA: requires page caching eviction + // - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction + // - FUSE_DO_READDIRPLUS/FUSE_READDIRPLUS_AUTO: requires FUSE_READDIRPLUS implementation + // - FUSE_ASYNC_DIO + // - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler + + // initialized after receiving FUSE_INIT reply. + // Until it's set, suspend sending FUSE requests. + // Use SetInitialized() and IsInitialized() for atomic access. + initialized int32 + + // initializedChan is used to block requests before initialization. + initializedChan chan struct{} + + // blocked when there are too many outstading backgrounds requests (NumBackground == MaxBackground). + // TODO(gvisor.dev/issue/3185): update the numBackground accordingly; use a channel to block. + blocked bool + + // connected (connection established) when a new FUSE file system is created. + // Set to false when: + // umount, + // connection abort, + // device release. + connected bool + + // aborted via sysfs. + // TODO(gvisor.dev/issue/3185): abort all queued requests. + aborted bool + + // connInitError if FUSE_INIT encountered error (major version mismatch). + // Only set in INIT. + connInitError bool + + // connInitSuccess if FUSE_INIT is successful. + // Only set in INIT. + // Used for destory. + connInitSuccess bool + + // TODO(gvisor.dev/issue/3185): All the queue logic are working in progress. + + // NumberBackground is the number of requests in the background. + numBackground uint16 + + // congestionThreshold for NumBackground. + // Negotiated in FUSE_INIT. + congestionThreshold uint16 + + // maxBackground is the maximum number of NumBackground. + // Block connection when it is reached. + // Negotiated in FUSE_INIT. + maxBackground uint16 + + // numActiveBackground is the number of requests in background and has being marked as active. + numActiveBackground uint16 + + // numWating is the number of requests waiting for completion. + numWaiting uint32 + + // TODO(gvisor.dev/issue/3185): BgQueue + // some queue for background queued requests. + + // bgLock protects: + // MaxBackground, CongestionThreshold, NumBackground, + // NumActiveBackground, BgQueue, Blocked. + bgLock sync.Mutex + + // maxRead is the maximum size of a read buffer in in bytes. + maxRead uint32 + + // maxWrite is the maximum size of a write buffer in bytes. + // Negotiated in FUSE_INIT. + maxWrite uint32 + + // maxPages is the maximum number of pages for a single request to use. + // Negotiated in FUSE_INIT. + maxPages uint16 + + // minor version of the FUSE protocol. + // Negotiated and only set in INIT. + minor uint32 + + // asyncRead if read pages asynchronously. + // Negotiated and only set in INIT. + asyncRead bool + + // abortErr is true if kernel need to return an unique read error after abort. + // Negotiated and only set in INIT. + abortErr bool + + // writebackCache is true for write-back cache policy, + // false for write-through policy. + // Negotiated and only set in INIT. + writebackCache bool + + // cacheSymlinks if filesystem needs to cache READLINK responses in page cache. + // Negotiated and only set in INIT. + cacheSymlinks bool + + // bigWrites if doing multi-page cached writes. + // Negotiated and only set in INIT. + bigWrites bool + + // dontMask if filestestem does not apply umask to creation modes. + // Negotiated in INIT. + dontMask bool } -// NewFUSEConnection creates a FUSE connection to fd -func NewFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRequests uint64) (*Connection, error) { +// newFUSEConnection creates a FUSE connection to fd. +func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRequests uint64) (*connection, error) { // Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to // mount a FUSE filesystem. fuseFD := fd.Impl().(*DeviceFD) @@ -84,16 +205,41 @@ func NewFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRe fuseFD.fullQueueCh = make(chan struct{}, maxInFlightRequests) fuseFD.writeCursor = 0 - return &Connection{ - fd: fuseFD, + return &connection{ + fd: fuseFD, + maxBackground: fuseDefaultMaxBackground, + congestionThreshold: fuseDefaultCongestionThreshold, + maxPages: fuseDefaultMaxPagesPerReq, + initializedChan: make(chan struct{}), + connected: true, }, nil } +// SetInitialized atomically sets the connection as initialized. +func (conn *connection) SetInitialized() { + // Unblock the requests sent before INIT. + close(conn.initializedChan) + + // Close the channel first to avoid the non-atomic situation + // where conn.initialized is true but there are + // tasks being blocked on the channel. + // And it prevents the newer tasks from gaining + // unnecessary higher chance to be issued before the blocked one. + + atomic.StoreInt32(&(conn.initialized), int32(1)) +} + +// IsInitialized atomically check if the connection is initialized. +// pairs with SetInitialized(). +func (conn *connection) Initialized() bool { + return atomic.LoadInt32(&(conn.initialized)) != 0 +} + // NewRequest creates a new request that can be sent to the FUSE server. -func (conn *Connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) { +func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) { conn.fd.mu.Lock() defer conn.fd.mu.Unlock() - conn.fd.nextOpID += linux.FUSEOpID(ReqIDStep) + conn.fd.nextOpID += linux.FUSEOpID(reqIDStep) hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes() hdr := linux.FUSEHeaderIn{ @@ -118,13 +264,49 @@ func (conn *Connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint } // Call makes a request to the server and blocks the invoking task until a -// server responds with a response. -// NOTE: If no task is provided then the Call will simply enqueue the request -// and return a nil response. No blocking will happen in this case. Instead, -// this is used to signify that the processing of this request will happen by -// the kernel.Task that writes the response. See FUSE_INIT for such an -// invocation. -func (conn *Connection) Call(t *kernel.Task, r *Request) (*Response, error) { +// server responds with a response. Task should never be nil. +// Requests will not be sent before the connection is initialized. +// For async tasks, use CallAsync(). +func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) { + // Block requests sent before connection is initalized. + if !conn.Initialized() { + if err := t.Block(conn.initializedChan); err != nil { + return nil, err + } + } + + return conn.call(t, r) +} + +// CallAsync makes an async (aka background) request. +// Those requests either do not expect a response (e.g. release) or +// the response should be handled by others (e.g. init). +// Return immediately unless the connection is blocked (before initialization). +// Async call example: init, release, forget, aio, interrupt. +// When the Request is FUSE_INIT, it will not be blocked before initialization. +func (conn *connection) CallAsync(t *kernel.Task, r *Request) error { + // Block requests sent before connection is initalized. + if !conn.Initialized() && r.hdr.Opcode != linux.FUSE_INIT { + if err := t.Block(conn.initializedChan); err != nil { + return err + } + } + + // This should be the only place that invokes call() with a nil task. + _, err := conn.call(nil, r) + return err +} + +// call makes a call without blocking checks. +func (conn *connection) call(t *kernel.Task, r *Request) (*Response, error) { + if !conn.connected { + return nil, syserror.ENOTCONN + } + + if conn.connInitError { + return nil, syserror.ECONNREFUSED + } + fut, err := conn.callFuture(t, r) if err != nil { return nil, err @@ -160,7 +342,7 @@ func (r *Response) UnmarshalPayload(m marshal.Marshallable) error { // callFuture makes a request to the server and returns a future response. // Call resolve() when the response needs to be fulfilled. -func (conn *Connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) { +func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) { conn.fd.mu.Lock() defer conn.fd.mu.Unlock() @@ -195,7 +377,7 @@ func (conn *Connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, } // callFutureLocked makes a request to the server and returns a future response. -func (conn *Connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) { +func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) { conn.fd.queue.PushBack(r) conn.fd.numActiveRequests += 1 fut := newFutureResponse(r.hdr.Opcode) diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go index f3443ac71..e522ff9a0 100644 --- a/pkg/sentry/fsimpl/fuse/dev.go +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -98,7 +99,9 @@ type DeviceFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *DeviceFD) Release() {} +func (fd *DeviceFD) Release(context.Context) { + fd.fs.conn.connected = false +} // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { @@ -124,7 +127,7 @@ func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.R minBuffSize := linux.FUSE_MIN_READ_BUFFER inHdrLen := uint32((*linux.FUSEHeaderIn)(nil).SizeBytes()) writeHdrLen := uint32((*linux.FUSEWriteIn)(nil).SizeBytes()) - negotiatedMinBuffSize := inHdrLen + writeHdrLen + fd.fs.conn.MaxWrite + negotiatedMinBuffSize := inHdrLen + writeHdrLen + fd.fs.conn.maxWrite if minBuffSize < negotiatedMinBuffSize { minBuffSize = negotiatedMinBuffSize } @@ -385,9 +388,9 @@ func (fd *DeviceFD) sendError(ctx context.Context, errno int32, req *Request) er // FUSE_INIT. func (fd *DeviceFD) noReceiverAction(ctx context.Context, r *Response) error { if r.opcode == linux.FUSE_INIT { - // TODO: process init response here. - // Maybe get the creds from the context? - // creds := auth.CredentialsFromContext(ctx) + creds := auth.CredentialsFromContext(ctx) + rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace() + return fd.fs.conn.InitRecv(r, creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs)) } return nil diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go index fcd77832a..1ffe7ccd2 100644 --- a/pkg/sentry/fsimpl/fuse/dev_test.go +++ b/pkg/sentry/fsimpl/fuse/dev_test.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -60,25 +59,25 @@ func TestFUSECommunication(t *testing.T) { Name: "SingleClientSingleServer", NumClients: 1, NumServers: 1, - MaxActiveRequests: MaxActiveRequestsDefault, + MaxActiveRequests: maxActiveRequestsDefault, }, { Name: "SingleClientMultipleServers", NumClients: 1, NumServers: 10, - MaxActiveRequests: MaxActiveRequestsDefault, + MaxActiveRequests: maxActiveRequestsDefault, }, { Name: "MultipleClientsSingleServer", NumClients: 10, NumServers: 1, - MaxActiveRequests: MaxActiveRequestsDefault, + MaxActiveRequests: maxActiveRequestsDefault, }, { Name: "MultipleClientsMultipleServers", NumClients: 10, NumServers: 10, - MaxActiveRequests: MaxActiveRequestsDefault, + MaxActiveRequests: maxActiveRequestsDefault, }, { Name: "RequestCapacityFull", @@ -145,7 +144,7 @@ func TestFUSECommunication(t *testing.T) { // CallTest makes a request to the server and blocks the invoking // goroutine until a server responds with a response. Doesn't block // a kernel.Task. Analogous to Connection.Call but used for testing. -func CallTest(conn *Connection, t *kernel.Task, r *Request, i uint32) (*Response, error) { +func CallTest(conn *connection, t *kernel.Task, r *Request, i uint32) (*Response, error) { conn.fd.mu.Lock() // Wait until we're certain that a new request can be processed. @@ -214,7 +213,7 @@ func ReadTest(serverTask *kernel.Task, fd *vfs.FileDescription, inIOseq usermem. // fuseClientRun emulates all the actions of a normal FUSE request. It creates // a header, a payload, calls the server, waits for the response, and processes // the response. -func fuseClientRun(t *testing.T, s *testutil.System, k *kernel.Kernel, conn *Connection, creds *auth.Credentials, pid uint32, inode uint64, clientDone chan struct{}) { +func fuseClientRun(t *testing.T, s *testutil.System, k *kernel.Kernel, conn *connection, creds *auth.Credentials, pid uint32, inode uint64, clientDone chan struct{}) { defer func() { clientDone <- struct{}{} }() tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) @@ -343,7 +342,7 @@ func setup(t *testing.T) *testutil.System { AllowUserMount: true, }) - mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.GetFilesystemOptions{}) + mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) if err != nil { t.Fatalf("NewMountNamespace(): %v", err) } @@ -353,16 +352,16 @@ func setup(t *testing.T) *testutil.System { // newTestConnection creates a fuse connection that the sentry can communicate with // and the FD for the server to communicate with. -func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*Connection, *vfs.FileDescription, error) { +func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*connection, *vfs.FileDescription, error) { vfsObj := &vfs.VirtualFilesystem{} fuseDev := &DeviceFD{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(system.Ctx); err != nil { return nil, nil, err } vd := vfsObj.NewAnonVirtualDentry("genCountFD") - defer vd.DecRef() + defer vd.DecRef(system.Ctx) if err := fuseDev.vfsfd.Init(fuseDev, linux.O_RDWR|linux.O_CREAT, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, nil, err } diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index 911b6f7cb..a1405f7c3 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -65,7 +65,7 @@ type filesystem struct { // conn is used for communication between the FUSE server // daemon and the sentry fusefs. - conn *Connection + conn *connection // opts is the options the fusefs is initialized with. opts *filesystemOptions @@ -140,7 +140,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fsopts.rootMode = rootMode // Set the maxInFlightRequests option. - fsopts.maxActiveRequests = MaxActiveRequestsDefault + fsopts.maxActiveRequests = maxActiveRequestsDefault // Check for unparsed options. if len(mopts) != 0 { @@ -157,8 +157,12 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fs.VFSFilesystem().Init(vfsObj, &fsType, fs) - // TODO: dispatch a FUSE_INIT request to the FUSE daemon server before - // returning. Mount will not block on this dispatched request. + // Send a FUSE_INIT request to the FUSE daemon server before returning. + // This call is not blocking. + if err := fs.conn.InitSend(creds, uint32(kernelTask.ThreadID())); err != nil { + log.Warningf("%s.InitSend: failed with error: %v", fsType.Name(), err) + return nil, nil, err + } // root is the fusefs root directory. root := fs.newInode(creds, fsopts.rootMode) @@ -173,7 +177,7 @@ func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOpt opts: opts, } - conn, err := NewFUSEConnection(ctx, device, opts.maxActiveRequests) + conn, err := newFUSEConnection(ctx, device, opts.maxActiveRequests) if err != nil { log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err) return nil, syserror.EINVAL @@ -187,13 +191,13 @@ func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOpt } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } -// Inode implements kernfs.Inode. -type Inode struct { +// inode implements kernfs.Inode. +type inode struct { kernfs.InodeAttrs kernfs.InodeNoDynamicLookup kernfs.InodeNotSymlink @@ -206,7 +210,7 @@ type Inode struct { } func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry { - i := &Inode{} + i := &inode{} i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) i.dentry.Init(i) @@ -215,7 +219,7 @@ func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *ke } // Open implements kernfs.Inode.Open. -func (i *Inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) if err != nil { return nil, err diff --git a/pkg/sentry/fsimpl/fuse/init.go b/pkg/sentry/fsimpl/fuse/init.go new file mode 100644 index 000000000..779c2bd3f --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/init.go @@ -0,0 +1,166 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +// consts used by FUSE_INIT negotiation. +const ( + // fuseMaxMaxPages is the maximum value for MaxPages received in InitOut. + // Follow the same behavior as unix fuse implementation. + fuseMaxMaxPages = 256 + + // Maximum value for the time granularity for file time stamps, 1s. + // Follow the same behavior as unix fuse implementation. + fuseMaxTimeGranNs = 1000000000 + + // Minimum value for MaxWrite. + // Follow the same behavior as unix fuse implementation. + fuseMinMaxWrite = 4096 + + // Temporary default value for max readahead, 128kb. + fuseDefaultMaxReadahead = 131072 + + // The FUSE_INIT_IN flags sent to the daemon. + // TODO(gvisor.dev/issue/3199): complete the flags. + fuseDefaultInitFlags = linux.FUSE_MAX_PAGES +) + +// Adjustable maximums for Connection's cogestion control parameters. +// Used as the upperbound of the config values. +// Currently we do not support adjustment to them. +var ( + MaxUserBackgroundRequest uint16 = fuseDefaultMaxBackground + MaxUserCongestionThreshold uint16 = fuseDefaultCongestionThreshold +) + +// InitSend sends a FUSE_INIT request. +func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error { + in := linux.FUSEInitIn{ + Major: linux.FUSE_KERNEL_VERSION, + Minor: linux.FUSE_KERNEL_MINOR_VERSION, + // TODO(gvisor.dev/issue/3196): find appropriate way to calculate this + MaxReadahead: fuseDefaultMaxReadahead, + Flags: fuseDefaultInitFlags, + } + + req, err := conn.NewRequest(creds, pid, 0, linux.FUSE_INIT, &in) + if err != nil { + return err + } + + // Since there is no task to block on and FUSE_INIT is the request + // to unblock other requests, use nil. + return conn.CallAsync(nil, req) +} + +// InitRecv receives a FUSE_INIT reply and process it. +func (conn *connection) InitRecv(res *Response, hasSysAdminCap bool) error { + if err := res.Error(); err != nil { + return err + } + + var out linux.FUSEInitOut + if err := res.UnmarshalPayload(&out); err != nil { + return err + } + + return conn.initProcessReply(&out, hasSysAdminCap) +} + +// Process the FUSE_INIT reply from the FUSE server. +func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap bool) error { + // No support for old major fuse versions. + if out.Major != linux.FUSE_KERNEL_VERSION { + conn.connInitError = true + + // Set the connection as initialized and unblock the blocked requests + // (i.e. return error for them). + conn.SetInitialized() + + return nil + } + + // Start processing the reply. + conn.connInitSuccess = true + conn.minor = out.Minor + + // No support for limits before minor version 13. + if out.Minor >= 13 { + conn.bgLock.Lock() + + if out.MaxBackground > 0 { + conn.maxBackground = out.MaxBackground + + if !hasSysAdminCap && + conn.maxBackground > MaxUserBackgroundRequest { + conn.maxBackground = MaxUserBackgroundRequest + } + } + + if out.CongestionThreshold > 0 { + conn.congestionThreshold = out.CongestionThreshold + + if !hasSysAdminCap && + conn.congestionThreshold > MaxUserCongestionThreshold { + conn.congestionThreshold = MaxUserCongestionThreshold + } + } + + conn.bgLock.Unlock() + } + + // No support for the following flags before minor version 6. + if out.Minor >= 6 { + conn.asyncRead = out.Flags&linux.FUSE_ASYNC_READ != 0 + conn.bigWrites = out.Flags&linux.FUSE_BIG_WRITES != 0 + conn.dontMask = out.Flags&linux.FUSE_DONT_MASK != 0 + conn.writebackCache = out.Flags&linux.FUSE_WRITEBACK_CACHE != 0 + conn.cacheSymlinks = out.Flags&linux.FUSE_CACHE_SYMLINKS != 0 + conn.abortErr = out.Flags&linux.FUSE_ABORT_ERROR != 0 + + // TODO(gvisor.dev/issue/3195): figure out how to use TimeGran (0 < TimeGran <= fuseMaxTimeGranNs). + + if out.Flags&linux.FUSE_MAX_PAGES != 0 { + maxPages := out.MaxPages + if maxPages < 1 { + maxPages = 1 + } + if maxPages > fuseMaxMaxPages { + maxPages = fuseMaxMaxPages + } + conn.maxPages = maxPages + } + } + + // No support for negotiating MaxWrite before minor version 5. + if out.Minor >= 5 { + conn.maxWrite = out.MaxWrite + } else { + conn.maxWrite = fuseMinMaxWrite + } + if conn.maxWrite < fuseMinMaxWrite { + conn.maxWrite = fuseMinMaxWrite + } + + // Set connection as initialized and unblock the requests + // issued before init. + conn.SetInitialized() + + return nil +} diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index 4a800dcf9..16787116f 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -85,5 +85,6 @@ go_test( deps = [ "//pkg/p9", "//pkg/sentry/contexttest", + "//pkg/sentry/pgalloc", ], ) diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 8c7c8e1b3..2a8011eb4 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -90,10 +90,8 @@ func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) { uid: uint32(opts.kuid), gid: uint32(opts.kgid), blockSize: usermem.PageSize, // arbitrary - handle: handle{ - fd: -1, - }, - nlink: uint32(2), + hostFD: -1, + nlink: uint32(2), } switch opts.mode.FileType() { case linux.S_IFDIR: @@ -122,7 +120,7 @@ type directoryFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *directoryFD) Release() { +func (fd *directoryFD) Release(context.Context) { } // IterDirents implements vfs.FileDescriptionImpl.IterDirents. @@ -139,7 +137,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fd.dirents = ds } - d.InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + d.InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent) if d.cachedMetadataAuthoritative() { d.touchAtime(fd.vfsfd.Mount()) } @@ -205,14 +203,14 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { off := uint64(0) const count = 64 * 1024 // for consistency with the vfs1 client d.handleMu.RLock() - if !d.handleReadable { + if d.readFile.isNil() { // This should not be possible because a readable handle should // have been opened when the calling directoryFD was opened. d.handleMu.RUnlock() panic("gofer.dentry.getDirents called without a readable handle") } for { - p9ds, err := d.handle.file.readdir(ctx, off, count) + p9ds, err := d.readFile.readdir(ctx, off, count) if err != nil { d.handleMu.RUnlock() return nil, err @@ -304,5 +302,5 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in // Sync implements vfs.FileDescriptionImpl.Sync. func (fd *directoryFD) Sync(ctx context.Context) error { - return fd.dentry().handle.sync(ctx) + return fd.dentry().syncRemoteFile(ctx) } diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 00e3c99cd..eaef2594d 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -15,6 +15,7 @@ package gofer import ( + "math" "sync" "sync/atomic" @@ -54,8 +55,8 @@ func (fs *filesystem) Sync(ctx context.Context) error { // Sync regular files. for _, d := range ds { - err := d.syncSharedHandle(ctx) - d.DecRef() + err := d.syncCachedFile(ctx) + d.DecRef(ctx) if err != nil && retErr == nil { retErr = err } @@ -65,7 +66,7 @@ func (fs *filesystem) Sync(ctx context.Context) error { // handles (so they won't be synced by the above). for _, sffd := range sffds { err := sffd.Sync(ctx) - sffd.vfsfd.DecRef() + sffd.vfsfd.DecRef(ctx) if err != nil && retErr == nil { retErr = err } @@ -133,7 +134,7 @@ afterSymlink: return d, nil } if name == ".." { - if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil { + if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { return nil, err } else if isRoot || d.parent == nil { rp.Advance() @@ -146,7 +147,7 @@ afterSymlink: // // Call rp.CheckMount() before updating d.parent's metadata, since if // we traverse to another mount then d.parent's metadata is irrelevant. - if err := rp.CheckMount(&d.parent.vfsd); err != nil { + if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { return nil, err } if d != d.parent && !d.cachedMetadataAuthoritative() { @@ -164,7 +165,7 @@ afterSymlink: if child == nil { return nil, syserror.ENOENT } - if err := rp.CheckMount(&child.vfsd); err != nil { + if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, err } if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() { @@ -239,7 +240,7 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // has 0 references, drop it). Wait to update parent.children until we // know what to replace the existing dentry with (i.e. one of the // returns below), to avoid a redundant map access. - vfsObj.InvalidateDentry(&child.vfsd) + vfsObj.InvalidateDentry(ctx, &child.vfsd) if child.isSynthetic() { // Normally we don't mark invalidated dentries as deleted since // they may still exist (but at a different path), and also for @@ -332,7 +333,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string) error, createInSyntheticDir func(parent *dentry, name string) error) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by @@ -384,7 +385,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if dir { ev |= linux.IN_ISDIR } - parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) + parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) return nil } if fs.opts.interop == InteropModeShared { @@ -405,7 +406,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if dir { ev |= linux.IN_ISDIR } - parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) + parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) return nil } if child := parent.children[name]; child != nil { @@ -426,7 +427,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if dir { ev |= linux.IN_ISDIR } - parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) + parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) return nil } @@ -434,7 +435,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by @@ -470,7 +471,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) parent.dirMu.Lock() defer parent.dirMu.Unlock() @@ -600,17 +601,17 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b // Generate inotify events for rmdir or unlink. if dir { - parent.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) + parent.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) } else { var cw *vfs.Watches if child != nil { cw = &child.watches } - vfs.InotifyRemoveChild(cw, &parent.watches, name) + vfs.InotifyRemoveChild(ctx, cw, &parent.watches, name) } if child != nil { - vfsObj.CommitDeleteDentry(&child.vfsd) + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) child.setDeleted() if child.isSynthetic() { parent.syntheticChildren-- @@ -637,7 +638,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b // but dentry slices are allocated lazily, and it's much easier to say "defer // fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() { // fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this. -func (fs *filesystem) renameMuRUnlockAndCheckCaching(ds **[]*dentry) { +func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) { fs.renameMu.RUnlock() if *ds == nil { return @@ -645,20 +646,20 @@ func (fs *filesystem) renameMuRUnlockAndCheckCaching(ds **[]*dentry) { if len(**ds) != 0 { fs.renameMu.Lock() for _, d := range **ds { - d.checkCachingLocked() + d.checkCachingLocked(ctx) } fs.renameMu.Unlock() } putDentrySlice(*ds) } -func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) { +func (fs *filesystem) renameMuUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) { if *ds == nil { fs.renameMu.Unlock() return } for _, d := range **ds { - d.checkCachingLocked() + d.checkCachingLocked(ctx) } fs.renameMu.Unlock() putDentrySlice(*ds) @@ -668,7 +669,7 @@ func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) { func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return err @@ -680,7 +681,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -701,7 +702,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by @@ -724,8 +725,29 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. if rp.Mount() != vd.Mount() { return syserror.EXDEV } - // 9P2000.L supports hard links, but we don't. - return syserror.EPERM + d := vd.Dentry().Impl().(*dentry) + if d.isDir() { + return syserror.EPERM + } + gid := auth.KGID(atomic.LoadUint32(&d.gid)) + uid := auth.KUID(atomic.LoadUint32(&d.uid)) + mode := linux.FileMode(atomic.LoadUint32(&d.mode)) + if err := vfs.MayLink(rp.Credentials(), mode, uid, gid); err != nil { + return err + } + if d.nlink == 0 { + return syserror.ENOENT + } + if d.nlink == math.MaxUint32 { + return syserror.EMLINK + } + if err := parent.file.link(ctx, d.file, childName); err != nil { + return err + } + + // Success! + atomic.AddUint32(&d.nlink, 1) + return nil }, nil) } @@ -812,7 +834,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) if !start.cachedMetadataAuthoritative() { @@ -1085,12 +1107,18 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD if useRegularFileFD { child.handleMu.Lock() - child.handle.file = openFile - if fdobj != nil { - child.handle.fd = int32(fdobj.Release()) + if vfs.MayReadFileWithOpenFlags(opts.Flags) { + child.readFile = openFile + if fdobj != nil { + child.hostFD = int32(fdobj.Release()) + } + } else if fdobj != nil { + // Can't use fdobj if it's not readable. + fdobj.Close() + } + if vfs.MayWriteFileWithOpenFlags(opts.Flags) { + child.writeFile = openFile } - child.handleReadable = vfs.MayReadFileWithOpenFlags(opts.Flags) - child.handleWritable = vfs.MayWriteFileWithOpenFlags(opts.Flags) child.handleMu.Unlock() } // Insert the dentry into the tree. @@ -1126,7 +1154,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } childVFSFD = &fd.vfsfd } - d.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) + d.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) return childVFSFD, nil } @@ -1134,7 +1162,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err @@ -1154,7 +1182,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa var ds *[]*dentry fs.renameMu.Lock() - defer fs.renameMuUnlockAndCheckCaching(&ds) + defer fs.renameMuUnlockAndCheckCaching(ctx, &ds) newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds) if err != nil { return err @@ -1244,7 +1272,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return nil } mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { return err } @@ -1269,7 +1297,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } // Update the dentry tree. - vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD) + vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD) if replaced != nil { replaced.setDeleted() if replaced.isSynthetic() { @@ -1331,17 +1359,17 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts fs.renameMu.RLock() d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } if err := d.setStat(ctx, rp.Credentials(), &opts, rp.Mount()); err != nil { - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { - d.InotifyWithParent(ev, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) } return nil } @@ -1350,7 +1378,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return linux.Statx{}, err @@ -1367,7 +1395,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return linux.Statfs{}, err @@ -1417,7 +1445,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -1443,7 +1471,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -1455,7 +1483,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err @@ -1469,16 +1497,16 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt fs.renameMu.RLock() d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } if err := d.setxattr(ctx, rp.Credentials(), &opts); err != nil { - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } @@ -1488,16 +1516,16 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, fs.renameMu.RLock() d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } if err := d.removexattr(ctx, rp.Credentials(), name); err != nil { - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index e20de84b5..4ac8dd81d 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -192,10 +192,14 @@ const ( // // - File timestamps are based on client clocks. This ensures that users of // the client observe timestamps that are coherent with their own clocks - // and consistent with Linux's semantics. However, since it is not always - // possible for clients to set arbitrary atimes and mtimes, and never - // possible for clients to set arbitrary ctimes, file timestamp changes are - // stored in the client only and never sent to the remote filesystem. + // and consistent with Linux's semantics (in particular, it is not always + // possible for clients to set arbitrary atimes and mtimes depending on the + // remote filesystem implementation, and never possible for clients to set + // arbitrary ctimes.) If a dentry containing a client-defined atime or + // mtime is evicted from cache, client timestamps will be sent to the + // remote filesystem on a best-effort basis to attempt to ensure that + // timestamps will be preserved when another dentry representing the same + // file is instantiated. InteropModeExclusive InteropMode = iota // InteropModeWritethrough is appropriate when there are read-only users of @@ -482,7 +486,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr) if err != nil { attachFile.close(ctx) - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, err } // Set the root's reference count to 2. One reference is returned to the @@ -495,17 +499,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { - ctx := context.Background() +func (fs *filesystem) Release(ctx context.Context) { mf := fs.mfp.MemoryFile() fs.syncMu.Lock() for d := range fs.syncableDentries { d.handleMu.Lock() d.dataMu.Lock() - if d.handleWritable { + if h := d.writeHandleLocked(); h.isOpen() { // Write dirty cached data to the remote file. - if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt); err != nil { + if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, fs.mfp.MemoryFile(), h.writeFromBlocksAt); err != nil { log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err) } // TODO(jamieliu): Do we need to flushf/fsync d? @@ -515,9 +518,9 @@ func (fs *filesystem) Release() { d.dirty.RemoveAll() d.dataMu.Unlock() // Close the host fd if one exists. - if d.handle.fd >= 0 { - syscall.Close(int(d.handle.fd)) - d.handle.fd = -1 + if d.hostFD >= 0 { + syscall.Close(int(d.hostFD)) + d.hostFD = -1 } d.handleMu.Unlock() } @@ -559,8 +562,6 @@ type dentry struct { // filesystem.renameMu. name string - // We don't support hard links, so each dentry maps 1:1 to an inode. - // file is the unopened p9.File that backs this dentry. file is immutable. // // If file.isNil(), this dentry represents a synthetic file, i.e. a file @@ -621,9 +622,20 @@ type dentry struct { mtime int64 ctime int64 btime int64 - // File size, protected by both metadataMu and dataMu (i.e. both must be + // File size, which differs from other metadata in two ways: + // + // - We make a best-effort attempt to keep it up to date even if + // !dentry.cachedMetadataAuthoritative() for the sake of O_APPEND writes. + // + // - size is protected by both metadataMu and dataMu (i.e. both must be // locked to mutate it; locking either is sufficient to access it). size uint64 + // If this dentry does not represent a synthetic file, deleted is 0, and + // atimeDirty/mtimeDirty are non-zero, atime/mtime may have diverged from the + // remote file's timestamps, which should be updated when this dentry is + // evicted. + atimeDirty uint32 + mtimeDirty uint32 // nlink counts the number of hard links to this dentry. It's updated and // accessed using atomic operations. It's not protected by metadataMu like the @@ -636,30 +648,28 @@ type dentry struct { // the file into memmap.MappingSpaces. mappings is protected by mapsMu. mappings memmap.MappingSet - // If this dentry represents a regular file or directory: - // - // - handle is the I/O handle used by all regularFileFDs/directoryFDs - // representing this dentry. - // - // - handleReadable is true if handle is readable. + // - If this dentry represents a regular file or directory, readFile is the + // p9.File used for reads by all regularFileFDs/directoryFDs representing + // this dentry. // - // - handleWritable is true if handle is writable. + // - If this dentry represents a regular file, writeFile is the p9.File + // used for writes by all regularFileFDs representing this dentry. // - // Invariants: - // - // - If handleReadable == handleWritable == false, then handle.file == nil - // (i.e. there is no open handle). Conversely, if handleReadable || - // handleWritable == true, then handle.file != nil (i.e. there is an open - // handle). - // - // - handleReadable and handleWritable cannot transition from true to false - // (i.e. handles may not be downgraded). + // - If this dentry represents a regular file, hostFD is the host FD used + // for memory mappings and I/O (when applicable) in preference to readFile + // and writeFile. hostFD is always readable; if !writeFile.isNil(), it must + // also be writable. If hostFD is -1, no such host FD is available. // // These fields are protected by handleMu. - handleMu sync.RWMutex - handle handle - handleReadable bool - handleWritable bool + // + // readFile and writeFile may or may not represent the same p9.File. Once + // either p9.File transitions from closed (isNil() == true) to open + // (isNil() == false), it may be mutated with handleMu locked, but cannot + // be closed until the dentry is destroyed. + handleMu sync.RWMutex + readFile p9file + writeFile p9file + hostFD int32 dataMu sync.RWMutex @@ -673,7 +683,7 @@ type dentry struct { // tracks dirty segments in cache. dirty is protected by dataMu. dirty fsutil.DirtySet - // pf implements platform.File for mappings of handle.fd. + // pf implements platform.File for mappings of hostFD. pf dentryPlatformFile // If this dentry represents a symbolic link, InteropModeShared is not in @@ -735,9 +745,7 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma uid: uint32(fs.opts.dfltuid), gid: uint32(fs.opts.dfltgid), blockSize: usermem.PageSize, - handle: handle{ - fd: -1, - }, + hostFD: -1, } d.pf.dentry = d if mask.UID { @@ -804,10 +812,12 @@ func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) { if attr.BlockSize != 0 { atomic.StoreUint32(&d.blockSize, uint32(attr.BlockSize)) } - if mask.ATime { + // Don't override newer client-defined timestamps with old server-defined + // ones. + if mask.ATime && atomic.LoadUint32(&d.atimeDirty) == 0 { atomic.StoreInt64(&d.atime, dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds)) } - if mask.MTime { + if mask.MTime && atomic.LoadUint32(&d.mtimeDirty) == 0 { atomic.StoreInt64(&d.mtime, dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds)) } if mask.CTime { @@ -826,9 +836,13 @@ func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) { // Preconditions: !d.isSynthetic() func (d *dentry) updateFromGetattr(ctx context.Context) error { - // Use d.handle.file, which represents a 9P fid that has been opened, in - // preference to d.file, which represents a 9P fid that has not. This may - // be significantly more efficient in some implementations. + // Use d.readFile or d.writeFile, which represent 9P fids that have been + // opened, in preference to d.file, which represents a 9P fid that has not. + // This may be significantly more efficient in some implementations. Prefer + // d.writeFile over d.readFile since some filesystem implementations may + // update a writable handle's metadata after writes to that handle, without + // making metadata updates immediately visible to read-only handles + // representing the same file. var ( file p9file handleMuRLocked bool @@ -838,8 +852,11 @@ func (d *dentry) updateFromGetattr(ctx context.Context) error { d.metadataMu.Lock() defer d.metadataMu.Unlock() d.handleMu.RLock() - if !d.handle.file.isNil() { - file = d.handle.file + if !d.writeFile.isNil() { + file = d.writeFile + handleMuRLocked = true + } else if !d.readFile.isNil() { + file = d.readFile handleMuRLocked = true } else { file = d.file @@ -904,51 +921,44 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs return err } defer mnt.EndWrite() - setLocalAtime := false - setLocalMtime := false + + if stat.Mask&linux.STATX_SIZE != 0 { + // Reject attempts to truncate files other than regular files, since + // filesystem implementations may return the wrong errno. + switch mode.FileType() { + case linux.S_IFREG: + // ok + case linux.S_IFDIR: + return syserror.EISDIR + default: + return syserror.EINVAL + } + } + + var now int64 if d.cachedMetadataAuthoritative() { - // Timestamp updates will be handled locally. - setLocalAtime = stat.Mask&linux.STATX_ATIME != 0 - setLocalMtime = stat.Mask&linux.STATX_MTIME != 0 - stat.Mask &^= linux.STATX_ATIME | linux.STATX_MTIME - - // Prepare for truncate. - if stat.Mask&linux.STATX_SIZE != 0 { - switch mode.FileType() { - case linux.ModeRegular: - if !setLocalMtime { - // Truncate updates mtime. - setLocalMtime = true - stat.Mtime.Nsec = linux.UTIME_NOW - } - case linux.ModeDirectory: - return syserror.EISDIR - default: - return syserror.EINVAL + // Truncate updates mtime. + if stat.Mask&(linux.STATX_SIZE|linux.STATX_MTIME) == linux.STATX_SIZE { + stat.Mask |= linux.STATX_MTIME + stat.Mtime = linux.StatxTimestamp{ + Nsec: linux.UTIME_NOW, } } + + // Use client clocks for timestamps. + now = d.fs.clock.Now().Nanoseconds() + if stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec == linux.UTIME_NOW { + stat.Atime = statxTimestampFromDentry(now) + } + if stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec == linux.UTIME_NOW { + stat.Mtime = statxTimestampFromDentry(now) + } } + d.metadataMu.Lock() defer d.metadataMu.Unlock() - if stat.Mask&linux.STATX_SIZE != 0 { - // The size needs to be changed even when - // !d.cachedMetadataAuthoritative() because d.mappings has to be - // updated. - d.updateFileSizeLocked(stat.Size) - } if !d.isSynthetic() { if stat.Mask != 0 { - if stat.Mask&linux.STATX_SIZE != 0 { - // Check whether to allow a truncate request to be made. - switch d.mode & linux.S_IFMT { - case linux.S_IFREG: - // Allow. - case linux.S_IFDIR: - return syserror.EISDIR - default: - return syserror.EINVAL - } - } if err := d.file.setAttr(ctx, p9.SetAttrMask{ Permissions: stat.Mask&linux.STATX_MODE != 0, UID: stat.Mask&linux.STATX_UID != 0, @@ -970,6 +980,12 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs }); err != nil { return err } + if stat.Mask&linux.STATX_SIZE != 0 { + // d.size should be kept up to date, and privatized + // copy-on-write mappings of truncated pages need to be + // invalidated, even if InteropModeShared is in effect. + d.updateFileSizeLocked(stat.Size) + } } if d.fs.opts.interop == InteropModeShared { // There's no point to updating d's metadata in this case since @@ -979,7 +995,6 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs return nil } } - now := d.fs.clock.Now().Nanoseconds() if stat.Mask&linux.STATX_MODE != 0 { atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode)) } @@ -989,23 +1004,18 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs if stat.Mask&linux.STATX_GID != 0 { atomic.StoreUint32(&d.gid, stat.GID) } - if setLocalAtime { - if stat.Atime.Nsec == linux.UTIME_NOW { - atomic.StoreInt64(&d.atime, now) - } else { - atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime)) - } - // Restore mask bits that we cleared earlier. - stat.Mask |= linux.STATX_ATIME + // Note that stat.Atime.Nsec and stat.Mtime.Nsec can't be UTIME_NOW because + // if d.cachedMetadataAuthoritative() then we converted stat.Atime and + // stat.Mtime to client-local timestamps above, and if + // !d.cachedMetadataAuthoritative() then we returned after calling + // d.file.setAttr(). For the same reason, now must have been initialized. + if stat.Mask&linux.STATX_ATIME != 0 { + atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime)) + atomic.StoreUint32(&d.atimeDirty, 0) } - if setLocalMtime { - if stat.Mtime.Nsec == linux.UTIME_NOW { - atomic.StoreInt64(&d.mtime, now) - } else { - atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime)) - } - // Restore mask bits that we cleared earlier. - stat.Mask |= linux.STATX_MTIME + if stat.Mask&linux.STATX_MTIME != 0 { + atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime)) + atomic.StoreUint32(&d.mtimeDirty, 0) } atomic.StoreInt64(&d.ctime, now) return nil @@ -1089,10 +1099,10 @@ func (d *dentry) TryIncRef() bool { } // DecRef implements vfs.DentryImpl.DecRef. -func (d *dentry) DecRef() { +func (d *dentry) DecRef(ctx context.Context) { if refs := atomic.AddInt64(&d.refs, -1); refs == 0 { d.fs.renameMu.Lock() - d.checkCachingLocked() + d.checkCachingLocked(ctx) d.fs.renameMu.Unlock() } else if refs < 0 { panic("gofer.dentry.DecRef() called without holding a reference") @@ -1109,7 +1119,7 @@ func (d *dentry) decRefLocked() { } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. -func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) { +func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { if d.isDir() { events |= linux.IN_ISDIR } @@ -1117,9 +1127,9 @@ func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) { d.fs.renameMu.RLock() // The ordering below is important, Linux always notifies the parent first. if d.parent != nil { - d.parent.watches.Notify(d.name, events, cookie, et, d.isDeleted()) + d.parent.watches.Notify(ctx, d.name, events, cookie, et, d.isDeleted()) } - d.watches.Notify("", events, cookie, et, d.isDeleted()) + d.watches.Notify(ctx, "", events, cookie, et, d.isDeleted()) d.fs.renameMu.RUnlock() } @@ -1131,10 +1141,10 @@ func (d *dentry) Watches() *vfs.Watches { // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. // // If no watches are left on this dentry and it has no references, cache it. -func (d *dentry) OnZeroWatches() { +func (d *dentry) OnZeroWatches(ctx context.Context) { if atomic.LoadInt64(&d.refs) == 0 { d.fs.renameMu.Lock() - d.checkCachingLocked() + d.checkCachingLocked(ctx) d.fs.renameMu.Unlock() } } @@ -1149,7 +1159,7 @@ func (d *dentry) OnZeroWatches() { // do nothing. // // Preconditions: d.fs.renameMu must be locked for writing. -func (d *dentry) checkCachingLocked() { +func (d *dentry) checkCachingLocked(ctx context.Context) { // Dentries with a non-zero reference count must be retained. (The only way // to obtain a reference on a dentry with zero references is via path // resolution, which requires renameMu, so if d.refs is zero then it will @@ -1171,14 +1181,14 @@ func (d *dentry) checkCachingLocked() { // reachable by path resolution and should be dropped immediately. if d.vfsd.IsDead() { if d.isDeleted() { - d.watches.HandleDeletion() + d.watches.HandleDeletion(ctx) } if d.cached { d.fs.cachedDentries.Remove(d) d.fs.cachedDentriesLen-- d.cached = false } - d.destroyLocked() + d.destroyLocked(ctx) return } // If d still has inotify watches and it is not deleted or invalidated, we @@ -1213,7 +1223,7 @@ func (d *dentry) checkCachingLocked() { if !victim.vfsd.IsDead() { // Note that victim can't be a mount point (in any mount // namespace), since VFS holds references on mount points. - d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(&victim.vfsd) + d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &victim.vfsd) delete(victim.parent.children, victim.name) // We're only deleting the dentry, not the file it // represents, so we don't need to update @@ -1221,7 +1231,7 @@ func (d *dentry) checkCachingLocked() { } victim.parent.dirMu.Unlock() } - victim.destroyLocked() + victim.destroyLocked(ctx) } // Whether or not victim was destroyed, we brought fs.cachedDentriesLen // back down to fs.opts.maxCachedDentries, so we don't loop. @@ -1233,7 +1243,7 @@ func (d *dentry) checkCachingLocked() { // // Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is // not a child dentry. -func (d *dentry) destroyLocked() { +func (d *dentry) destroyLocked(ctx context.Context) { switch atomic.LoadInt64(&d.refs) { case 0: // Mark the dentry destroyed. @@ -1244,27 +1254,60 @@ func (d *dentry) destroyLocked() { panic("dentry.destroyLocked() called with references on the dentry") } - ctx := context.Background() + mf := d.fs.mfp.MemoryFile() d.handleMu.Lock() - if !d.handle.file.isNil() { - mf := d.fs.mfp.MemoryFile() - d.dataMu.Lock() + d.dataMu.Lock() + if h := d.writeHandleLocked(); h.isOpen() { // Write dirty pages back to the remote filesystem. - if d.handleWritable { - if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { - log.Warningf("gofer.dentry.DecRef: failed to write dirty data back: %v", err) - } + if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil { + log.Warningf("gofer.dentry.destroyLocked: failed to write dirty data back: %v", err) } - // Discard cached data. + } + // Discard cached data. + if !d.cache.IsEmpty() { + mf.MarkAllUnevictable(d) d.cache.DropAll(mf) d.dirty.RemoveAll() - d.dataMu.Unlock() - // Clunk open fids and close open host FDs. - d.handle.close(ctx) + } + d.dataMu.Unlock() + // Clunk open fids and close open host FDs. + if !d.readFile.isNil() { + d.readFile.close(ctx) + } + if !d.writeFile.isNil() && d.readFile != d.writeFile { + d.writeFile.close(ctx) + } + d.readFile = p9file{} + d.writeFile = p9file{} + if d.hostFD >= 0 { + syscall.Close(int(d.hostFD)) + d.hostFD = -1 } d.handleMu.Unlock() if !d.file.isNil() { + if !d.isDeleted() { + // Write dirty timestamps back to the remote filesystem. + atimeDirty := atomic.LoadUint32(&d.atimeDirty) != 0 + mtimeDirty := atomic.LoadUint32(&d.mtimeDirty) != 0 + if atimeDirty || mtimeDirty { + atime := atomic.LoadInt64(&d.atime) + mtime := atomic.LoadInt64(&d.mtime) + if err := d.file.setAttr(ctx, p9.SetAttrMask{ + ATime: atimeDirty, + ATimeNotSystemTime: atimeDirty, + MTime: mtimeDirty, + MTimeNotSystemTime: mtimeDirty, + }, p9.SetAttr{ + ATimeSeconds: uint64(atime / 1e9), + ATimeNanoSeconds: uint64(atime % 1e9), + MTimeSeconds: uint64(mtime / 1e9), + MTimeNanoSeconds: uint64(mtime % 1e9), + }); err != nil { + log.Warningf("gofer.dentry.destroyLocked: failed to write dirty timestamps back: %v", err) + } + } + } d.file.close(ctx) d.file = p9file{} // Remove d from the set of syncable dentries. @@ -1276,7 +1319,7 @@ func (d *dentry) destroyLocked() { // d.fs.renameMu. if d.parent != nil { if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 { - d.parent.checkCachingLocked() + d.parent.checkCachingLocked(ctx) } else if refs < 0 { panic("gofer.dentry.DecRef() called without holding a reference") } @@ -1371,80 +1414,120 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool // O_TRUNC). if !trunc { d.handleMu.RLock() - if (!read || d.handleReadable) && (!write || d.handleWritable) { - // The current handle is sufficient. + if (!read || !d.readFile.isNil()) && (!write || !d.writeFile.isNil()) { + // Current handles are sufficient. d.handleMu.RUnlock() return nil } d.handleMu.RUnlock() } - haveOldFD := false + fdToClose := int32(-1) + invalidateTranslations := false d.handleMu.Lock() - if (read && !d.handleReadable) || (write && !d.handleWritable) || trunc { - // Get a new handle. - wantReadable := d.handleReadable || read - wantWritable := d.handleWritable || write - h, err := openHandle(ctx, d.file, wantReadable, wantWritable, trunc) + if (read && d.readFile.isNil()) || (write && d.writeFile.isNil()) || trunc { + // Get a new handle. If this file has been opened for both reading and + // writing, try to get a single handle that is usable for both: + // + // - Writable memory mappings of a host FD require that the host FD is + // opened for both reading and writing. + // + // - NOTE(b/141991141): Some filesystems may not ensure coherence + // between multiple handles for the same file. + openReadable := !d.readFile.isNil() || read + openWritable := !d.writeFile.isNil() || write + h, err := openHandle(ctx, d.file, openReadable, openWritable, trunc) + if err == syserror.EACCES && (openReadable != read || openWritable != write) { + // It may not be possible to use a single handle for both + // reading and writing, since permissions on the file may have + // changed to e.g. disallow reading after previously being + // opened for reading. In this case, we have no choice but to + // use separate handles for reading and writing. + ctx.Debugf("gofer.dentry.ensureSharedHandle: bifurcating read/write handles for dentry %p", d) + openReadable = read + openWritable = write + h, err = openHandle(ctx, d.file, openReadable, openWritable, trunc) + } if err != nil { d.handleMu.Unlock() return err } - if !d.handle.file.isNil() { - // Check that old and new handles are compatible: If the old handle - // includes a host file descriptor but the new one does not, or - // vice versa, old and new memory mappings may be incoherent. - haveOldFD = d.handle.fd >= 0 - haveNewFD := h.fd >= 0 - if haveOldFD != haveNewFD { - d.handleMu.Unlock() - ctx.Warningf("gofer.dentry.ensureSharedHandle: can't change host FD availability from %v to %v across dentry handle upgrade", haveOldFD, haveNewFD) - h.close(ctx) - return syserror.EIO - } - if haveOldFD { - // We may have raced with callers of d.pf.FD() that are now - // using the old file descriptor, preventing us from safely - // closing it. We could handle this by invalidating existing - // memmap.Translations, but this is expensive. Instead, use - // dup3 to make the old file descriptor refer to the new file - // description, then close the new file descriptor (which is no - // longer needed). Racing callers may use the old or new file - // description, but this doesn't matter since they refer to the - // same file (unless d.fs.opts.overlayfsStaleRead is true, - // which we handle separately). - if err := syscall.Dup3(int(h.fd), int(d.handle.fd), syscall.O_CLOEXEC); err != nil { + + if d.hostFD < 0 && openReadable && h.fd >= 0 { + // We have no existing FD; use the new FD for at least reading. + d.hostFD = h.fd + } else if d.hostFD >= 0 && d.writeFile.isNil() && openWritable { + // We have an existing read-only FD, but the file has just been + // opened for writing, so we need to start supporting writable memory + // mappings. This may race with callers of d.pf.FD() using the existing + // FD, so in most cases we need to delay closing the old FD until after + // invalidating memmap.Translations that might have observed it. + if !openReadable || h.fd < 0 { + // We don't have a read/write FD, so we have no FD that can be + // used to create writable memory mappings. Switch to using the + // internal page cache. + invalidateTranslations = true + fdToClose = d.hostFD + d.hostFD = -1 + } else if d.fs.opts.overlayfsStaleRead { + // We do have a read/write FD, but it may not be coherent with + // the existing read-only FD, so we must switch to mappings of + // the new FD in both the application and sentry. + if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil { d.handleMu.Unlock() - ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err) + ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err) h.close(ctx) return err } - syscall.Close(int(h.fd)) - h.fd = d.handle.fd - if d.fs.opts.overlayfsStaleRead { - // Replace sentry mappings of the old FD with mappings of - // the new FD, since the two are not necessarily coherent. - if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil { - d.handleMu.Unlock() - ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err) - h.close(ctx) - return err - } + invalidateTranslations = true + fdToClose = d.hostFD + d.hostFD = h.fd + } else { + // We do have a read/write FD. To avoid invalidating existing + // memmap.Translations (which is expensive), use dup3 to make + // the old file descriptor refer to the new file description, + // then close the new file descriptor (which is no longer + // needed). Racing callers of d.pf.FD() may use the old or new + // file description, but this doesn't matter since they refer + // to the same file, and any racing mappings must be read-only. + if err := syscall.Dup3(int(h.fd), int(d.hostFD), syscall.O_CLOEXEC); err != nil { + oldHostFD := d.hostFD + d.handleMu.Unlock() + ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, oldHostFD, err) + h.close(ctx) + return err } - // Clunk the old fid before making the new handle visible (by - // unlocking d.handleMu). - d.handle.file.close(ctx) + fdToClose = h.fd } + } else { + // h.fd is not useful. + fdToClose = h.fd + } + + // Switch to new fids. + var oldReadFile p9file + if openReadable { + oldReadFile = d.readFile + d.readFile = h.file + } + var oldWriteFile p9file + if openWritable { + oldWriteFile = d.writeFile + d.writeFile = h.file + } + // NOTE(b/141991141): Clunk old fids before making new fids visible (by + // unlocking d.handleMu). + if !oldReadFile.isNil() { + oldReadFile.close(ctx) + } + if !oldWriteFile.isNil() && oldReadFile != oldWriteFile { + oldWriteFile.close(ctx) } - // Switch to the new handle. - d.handle = h - d.handleReadable = wantReadable - d.handleWritable = wantWritable } d.handleMu.Unlock() - if d.fs.opts.overlayfsStaleRead && haveOldFD { - // Invalidate application mappings that may be using the old FD; they + if invalidateTranslations { + // Invalidate application mappings that may be using an old FD; they // will be replaced with mappings using the new FD after future calls // to d.Translate(). This requires holding d.mapsMu, which precedes // d.handleMu in the lock order. @@ -1452,7 +1535,51 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool d.mappings.InvalidateAll(memmap.InvalidateOpts{}) d.mapsMu.Unlock() } + if fdToClose >= 0 { + syscall.Close(int(fdToClose)) + } + + return nil +} + +// Preconditions: d.handleMu must be locked. +func (d *dentry) readHandleLocked() handle { + return handle{ + file: d.readFile, + fd: d.hostFD, + } +} + +// Preconditions: d.handleMu must be locked. +func (d *dentry) writeHandleLocked() handle { + return handle{ + file: d.writeFile, + fd: d.hostFD, + } +} + +func (d *dentry) syncRemoteFile(ctx context.Context) error { + d.handleMu.RLock() + defer d.handleMu.RUnlock() + return d.syncRemoteFileLocked(ctx) +} +// Preconditions: d.handleMu must be locked. +func (d *dentry) syncRemoteFileLocked(ctx context.Context) error { + // If we have a host FD, fsyncing it is likely to be faster than an fsync + // RPC. + if d.hostFD >= 0 { + ctx.UninterruptibleSleepStart(false) + err := syscall.Fsync(int(d.hostFD)) + ctx.UninterruptibleSleepFinish(false) + return err + } + if !d.writeFile.isNil() { + return d.writeFile.fsync(ctx) + } + if !d.readFile.isNil() { + return d.readFile.fsync(ctx) + } return nil } @@ -1514,7 +1641,7 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) return err } if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { - fd.dentry().InotifyWithParent(ev, 0, vfs.InodeEvent) + fd.dentry().InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) } return nil } @@ -1535,7 +1662,7 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption if err := d.setxattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil { return err } - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } @@ -1545,7 +1672,7 @@ func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { if err := d.removexattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil { return err } - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go index adff39490..36cca3625 100644 --- a/pkg/sentry/fsimpl/gofer/gofer_test.go +++ b/pkg/sentry/fsimpl/gofer/gofer_test.go @@ -20,10 +20,13 @@ import ( "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" ) func TestDestroyIdempotent(t *testing.T) { + ctx := contexttest.Context(t) fs := filesystem{ + mfp: pgalloc.MemoryFileProviderFromContext(ctx), syncableDentries: make(map[*dentry]struct{}), opts: filesystemOptions{ // Test relies on no dentry being held in the cache. @@ -31,7 +34,6 @@ func TestDestroyIdempotent(t *testing.T) { }, } - ctx := contexttest.Context(t) attr := &p9.Attr{ Mode: p9.ModeRegular, } @@ -50,7 +52,7 @@ func TestDestroyIdempotent(t *testing.T) { } parent.cacheNewChildLocked(child, "child") - child.checkCachingLocked() + child.checkCachingLocked(ctx) if got := atomic.LoadInt64(&child.refs); got != -1 { t.Fatalf("child.refs=%d, want: -1", got) } @@ -58,6 +60,6 @@ func TestDestroyIdempotent(t *testing.T) { if got := atomic.LoadInt64(&parent.refs); got != -1 { t.Fatalf("parent.refs=%d, want: -1", got) } - child.checkCachingLocked() - child.checkCachingLocked() + child.checkCachingLocked(ctx) + child.checkCachingLocked(ctx) } diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go index 8792ca4f2..104157512 100644 --- a/pkg/sentry/fsimpl/gofer/handle.go +++ b/pkg/sentry/fsimpl/gofer/handle.go @@ -63,6 +63,10 @@ func openHandle(ctx context.Context, file p9file, read, write, trunc bool) (hand }, nil } +func (h *handle) isOpen() bool { + return !h.file.isNil() +} + func (h *handle) close(ctx context.Context) { h.file.close(ctx) h.file = p9file{} @@ -124,18 +128,3 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o } return cp, cperr } - -func (h *handle) sync(ctx context.Context) error { - // Handle most common case first. - if h.fd >= 0 { - ctx.UninterruptibleSleepStart(false) - err := syscall.Fsync(int(h.fd)) - ctx.UninterruptibleSleepFinish(false) - return err - } - if h.file.isNil() { - // File hasn't been touched, there is nothing to sync. - return nil - } - return h.file.fsync(ctx) -} diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 09f142cfc..7e1cbf065 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -48,7 +48,7 @@ type regularFileFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *regularFileFD) Release() { +func (fd *regularFileFD) Release(context.Context) { } // OnClose implements vfs.FileDescriptionImpl.OnClose. @@ -64,34 +64,34 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error { } d.handleMu.RLock() defer d.handleMu.RUnlock() - return d.handle.file.flush(ctx) + if d.writeFile.isNil() { + return nil + } + return d.writeFile.flush(ctx) } // Allocate implements vfs.FileDescriptionImpl.Allocate. func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { - d := fd.dentry() d.metadataMu.Lock() defer d.metadataMu.Unlock() - size := offset + length - // Allocating a smaller size is a noop. - if size <= d.size { + size := offset + length + if d.cachedMetadataAuthoritative() && size <= d.size { return nil } - d.handleMu.Lock() - defer d.handleMu.Unlock() - - err := d.handle.file.allocate(ctx, p9.ToAllocateMode(mode), offset, length) + d.handleMu.RLock() + err := d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length) + d.handleMu.RUnlock() if err != nil { return err } d.dataMu.Lock() atomic.StoreUint64(&d.size, size) d.dataMu.Unlock() - if !d.cachedMetadataAuthoritative() { + if d.cachedMetadataAuthoritative() { d.touchCMtimeLocked() } return nil @@ -113,7 +113,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs // Check for reading at EOF before calling into MM (but not under // InteropModeShared, which makes d.size unreliable). d := fd.dentry() - if d.fs.opts.interop != InteropModeShared && uint64(offset) >= atomic.LoadUint64(&d.size) { + if d.cachedMetadataAuthoritative() && uint64(offset) >= atomic.LoadUint64(&d.size) { return 0, io.EOF } @@ -184,6 +184,7 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off d.metadataMu.Lock() defer d.metadataMu.Unlock() + // Set offset to file size if the fd was opened with O_APPEND. if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { // Holding d.metadataMu is sufficient for reading d.size. @@ -194,70 +195,86 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off return 0, offset, err } src = src.TakeFirst64(limit) - n, err := fd.pwriteLocked(ctx, src, offset, opts) - return n, offset + n, err -} -// Preconditions: fd.dentry().metatdataMu must be locked. -func (fd *regularFileFD) pwriteLocked(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - d := fd.dentry() if d.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:__generic_file_write_iter() => // file_update_time(). This is d.touchCMtime(), but without locking // d.metadataMu (recursively). d.touchCMtimeLocked() } - if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { - // Write dirty cached pages that will be touched by the write back to - // the remote file. - if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { - return 0, err - } - // Remove touched pages from the cache. - pgstart := usermem.PageRoundDown(uint64(offset)) - pgend, ok := usermem.PageRoundUp(uint64(offset + src.NumBytes())) - if !ok { - return 0, syserror.EINVAL - } - mr := memmap.MappableRange{pgstart, pgend} - var freed []memmap.FileRange - d.dataMu.Lock() - cseg := d.cache.LowerBoundSegment(mr.Start) - for cseg.Ok() && cseg.Start() < mr.End { - cseg = d.cache.Isolate(cseg, mr) - freed = append(freed, memmap.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()}) - cseg = d.cache.Remove(cseg).NextSegment() - } - d.dataMu.Unlock() - // Invalidate mappings of removed pages. - d.mapsMu.Lock() - d.mappings.Invalidate(mr, memmap.InvalidateOpts{}) - d.mapsMu.Unlock() - // Finally free pages removed from the cache. - mf := d.fs.mfp.MemoryFile() - for _, freedFR := range freed { - mf.DecRef(freedFR) - } - } + rw := getDentryReadWriter(ctx, d, offset) + defer putDentryReadWriter(rw) + if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { + if err := fd.writeCache(ctx, d, offset, src); err != nil { + return 0, offset, err + } + // Require the write to go to the remote file. rw.direct = true } + n, err := src.CopyInTo(ctx, rw) - putDentryReadWriter(rw) - if n != 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { + if err != nil { + return n, offset + n, err + } + if n > 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { + // Note that if any of the following fail, then we can't guarantee that + // any data was actually written with the semantics of O_DSYNC or + // O_SYNC, so we return zero bytes written. Compare Linux's + // mm/filemap.c:generic_file_write_iter() => + // include/linux/fs.h:generic_write_sync(). + // // Write dirty cached pages touched by the write back to the remote // file. if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { - return 0, err + return 0, offset, err } // Request the remote filesystem to sync the remote file. - if err := d.handle.file.fsync(ctx); err != nil { - return 0, err + if err := d.syncRemoteFile(ctx); err != nil { + return 0, offset, err } } - return n, err + return n, offset + n, nil +} + +func (fd *regularFileFD) writeCache(ctx context.Context, d *dentry, offset int64, src usermem.IOSequence) error { + // Write dirty cached pages that will be touched by the write back to + // the remote file. + if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { + return err + } + + // Remove touched pages from the cache. + pgstart := usermem.PageRoundDown(uint64(offset)) + pgend, ok := usermem.PageRoundUp(uint64(offset + src.NumBytes())) + if !ok { + return syserror.EINVAL + } + mr := memmap.MappableRange{pgstart, pgend} + var freed []memmap.FileRange + + d.dataMu.Lock() + cseg := d.cache.LowerBoundSegment(mr.Start) + for cseg.Ok() && cseg.Start() < mr.End { + cseg = d.cache.Isolate(cseg, mr) + freed = append(freed, memmap.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()}) + cseg = d.cache.Remove(cseg).NextSegment() + } + d.dataMu.Unlock() + + // Invalidate mappings of removed pages. + d.mapsMu.Lock() + d.mappings.Invalidate(mr, memmap.InvalidateOpts{}) + d.mapsMu.Unlock() + + // Finally free pages removed from the cache. + mf := d.fs.mfp.MemoryFile() + for _, freedFR := range freed { + mf.DecRef(freedFR) + } + return nil } // Write implements vfs.FileDescriptionImpl.Write. @@ -307,10 +324,11 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) // coherence with memory-mapped I/O), or if InteropModeShared is in effect // (which prevents us from caching file contents and makes dentry.size // unreliable), or if the file was opened O_DIRECT, read directly from - // dentry.handle without locking dentry.dataMu. + // dentry.readHandleLocked() without locking dentry.dataMu. rw.d.handleMu.RLock() - if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { - n, err := rw.d.handle.readToBlocksAt(rw.ctx, dsts, rw.off) + h := rw.d.readHandleLocked() + if (rw.d.hostFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { + n, err := h.readToBlocksAt(rw.ctx, dsts, rw.off) rw.d.handleMu.RUnlock() rw.off += n return n, err @@ -378,7 +396,7 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) End: gapEnd, } optMR := gap.Range() - err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, rw.d.handle.readToBlocksAt) + err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, h.readToBlocksAt) mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End}) seg, gap = rw.d.cache.Find(rw.off) if !seg.Ok() { @@ -393,7 +411,7 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) } else { // Read directly from the file. gapDsts := dsts.TakeFirst64(gapMR.Length()) - n, err := rw.d.handle.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start) + n, err := h.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start) done += n rw.off += n dsts = dsts.DropFirst64(n) @@ -425,11 +443,12 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro // If we have a mmappable host FD (which must be used here to ensure // coherence with memory-mapped I/O), or if InteropModeShared is in effect // (which prevents us from caching file contents), or if the file was - // opened with O_DIRECT, write directly to dentry.handle without locking - // dentry.dataMu. + // opened with O_DIRECT, write directly to dentry.writeHandleLocked() + // without locking dentry.dataMu. rw.d.handleMu.RLock() - if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { - n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, srcs, rw.off) + h := rw.d.writeHandleLocked() + if (rw.d.hostFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { + n, err := h.writeFromBlocksAt(rw.ctx, srcs, rw.off) rw.off += n rw.d.dataMu.Lock() if rw.off > rw.d.size { @@ -491,7 +510,7 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro // for detecting or avoiding this. gapMR := gap.Range().Intersect(mr) gapSrcs := srcs.TakeFirst64(gapMR.Length()) - n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start) + n, err := h.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start) done += n rw.off += n srcs = srcs.DropFirst64(n) @@ -517,7 +536,7 @@ exitLoop: if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{ Start: start, End: rw.off, - }, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, rw.d.handle.writeFromBlocksAt); err != nil { + }, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, h.writeFromBlocksAt); err != nil { // We have no idea how many bytes were actually flushed. rw.off = start done = 0 @@ -535,6 +554,7 @@ func (d *dentry) writeback(ctx context.Context, offset, size int64) error { } d.handleMu.RLock() defer d.handleMu.RUnlock() + h := d.writeHandleLocked() d.dataMu.Lock() defer d.dataMu.Unlock() // Compute the range of valid bytes (overflow-checked). @@ -548,7 +568,7 @@ func (d *dentry) writeback(ctx context.Context, offset, size int64) error { return fsutil.SyncDirty(ctx, memmap.MappableRange{ Start: uint64(offset), End: uint64(end), - }, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt) + }, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt) } // Seek implements vfs.FileDescriptionImpl.Seek. @@ -605,24 +625,23 @@ func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int6 // Sync implements vfs.FileDescriptionImpl.Sync. func (fd *regularFileFD) Sync(ctx context.Context) error { - return fd.dentry().syncSharedHandle(ctx) + return fd.dentry().syncCachedFile(ctx) } -func (d *dentry) syncSharedHandle(ctx context.Context) error { +func (d *dentry) syncCachedFile(ctx context.Context) error { d.handleMu.RLock() defer d.handleMu.RUnlock() - if d.handleWritable { + if h := d.writeHandleLocked(); h.isOpen() { d.dataMu.Lock() // Write dirty cached data to the remote file. - err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt) + err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt) d.dataMu.Unlock() if err != nil { return err } } - // Sync the remote file. - return d.handle.sync(ctx) + return d.syncRemoteFileLocked(ctx) } // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. @@ -646,7 +665,7 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt return syserror.ENODEV } d.handleMu.RLock() - haveFD := d.handle.fd >= 0 + haveFD := d.hostFD >= 0 d.handleMu.RUnlock() if !haveFD { return syserror.ENODEV @@ -667,7 +686,7 @@ func (d *dentry) mayCachePages() bool { return true } d.handleMu.RLock() - haveFD := d.handle.fd >= 0 + haveFD := d.hostFD >= 0 d.handleMu.RUnlock() return haveFD } @@ -725,7 +744,7 @@ func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, // Translate implements memmap.Mappable.Translate. func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { d.handleMu.RLock() - if d.handle.fd >= 0 && !d.fs.opts.forcePageCache { + if d.hostFD >= 0 && !d.fs.opts.forcePageCache { d.handleMu.RUnlock() mr := optional if d.fs.opts.limitHostFDTranslation { @@ -761,7 +780,8 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab } mf := d.fs.mfp.MemoryFile() - cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, d.handle.readToBlocksAt) + h := d.readHandleLocked() + cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, h.readToBlocksAt) var ts []memmap.Translation var translatedEnd uint64 @@ -830,9 +850,12 @@ func (d *dentry) InvalidateUnsavable(ctx context.Context) error { // Write the cache's contents back to the remote file so that if we have a // host fd after restore, the remote file's contents are coherent. mf := d.fs.mfp.MemoryFile() + d.handleMu.RLock() + defer d.handleMu.RUnlock() + h := d.writeHandleLocked() d.dataMu.Lock() defer d.dataMu.Unlock() - if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { + if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil { return err } @@ -847,20 +870,23 @@ func (d *dentry) InvalidateUnsavable(ctx context.Context) error { // Evict implements pgalloc.EvictableMemoryUser.Evict. func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { + mr := memmap.MappableRange{er.Start, er.End} + mf := d.fs.mfp.MemoryFile() d.mapsMu.Lock() defer d.mapsMu.Unlock() + d.handleMu.RLock() + defer d.handleMu.RUnlock() + h := d.writeHandleLocked() d.dataMu.Lock() defer d.dataMu.Unlock() - mr := memmap.MappableRange{er.Start, er.End} - mf := d.fs.mfp.MemoryFile() // Only allow pages that are no longer memory-mapped to be evicted. for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() { mgapMR := mgap.Range().Intersect(mr) if mgapMR.Length() == 0 { continue } - if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { + if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil { log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err) } d.cache.Drop(mgapMR, mf) @@ -872,8 +898,8 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { // cannot implement both vfs.DentryImpl.IncRef and memmap.File.IncRef. // // dentryPlatformFile is only used when a host FD representing the remote file -// is available (i.e. dentry.handle.fd >= 0), and that FD is used for -// application memory mappings (i.e. !filesystem.opts.forcePageCache). +// is available (i.e. dentry.hostFD >= 0), and that FD is used for application +// memory mappings (i.e. !filesystem.opts.forcePageCache). type dentryPlatformFile struct { *dentry @@ -881,8 +907,8 @@ type dentryPlatformFile struct { // by dentry.dataMu. fdRefs fsutil.FrameRefSet - // If this dentry represents a regular file, and handle.fd >= 0, - // hostFileMapper caches mappings of handle.fd. + // If this dentry represents a regular file, and dentry.hostFD >= 0, + // hostFileMapper caches mappings of dentry.hostFD. hostFileMapper fsutil.HostFileMapper // hostFileMapperInitOnce is used to lazily initialize hostFileMapper. @@ -906,15 +932,13 @@ func (d *dentryPlatformFile) DecRef(fr memmap.FileRange) { // MapInternal implements memmap.File.MapInternal. func (d *dentryPlatformFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { d.handleMu.RLock() - bs, err := d.hostFileMapper.MapInternal(fr, int(d.handle.fd), at.Write) - d.handleMu.RUnlock() - return bs, err + defer d.handleMu.RUnlock() + return d.hostFileMapper.MapInternal(fr, int(d.hostFD), at.Write) } // FD implements memmap.File.FD. func (d *dentryPlatformFile) FD() int { d.handleMu.RLock() - fd := d.handle.fd - d.handleMu.RUnlock() - return int(fd) + defer d.handleMu.RUnlock() + return int(d.hostFD) } diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go index d6dbe9092..85d2bee72 100644 --- a/pkg/sentry/fsimpl/gofer/socket.go +++ b/pkg/sentry/fsimpl/gofer/socket.go @@ -108,7 +108,7 @@ func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.Connect // We don't need the receiver. c.CloseRecv() - c.Release() + c.Release(ctx) return c, nil } @@ -136,8 +136,8 @@ func (e *endpoint) newConnectedEndpoint(ctx context.Context, flags p9.ConnectFla } // Release implements transport.BoundEndpoint.Release. -func (e *endpoint) Release() { - e.dentry.DecRef() +func (e *endpoint) Release(ctx context.Context) { + e.dentry.DecRef(ctx) } // Passcred implements transport.BoundEndpoint.Passcred. diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 811528982..a6368fdd0 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -17,6 +17,7 @@ package gofer import ( "sync" "sync/atomic" + "syscall" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -80,11 +81,11 @@ func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *specialFileFD) Release() { +func (fd *specialFileFD) Release(ctx context.Context) { if fd.haveQueue { fdnotifier.RemoveFD(fd.handle.fd) } - fd.handle.close(context.Background()) + fd.handle.close(ctx) fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) fs.syncMu.Lock() delete(fs.specialFileFDs, fd) @@ -279,5 +280,13 @@ func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) ( // Sync implements vfs.FileDescriptionImpl.Sync. func (fd *specialFileFD) Sync(ctx context.Context) error { - return fd.dentry().syncSharedHandle(ctx) + // If we have a host FD, fsyncing it is likely to be faster than an fsync + // RPC. + if fd.handle.fd >= 0 { + ctx.UninterruptibleSleepStart(false) + err := syscall.Fsync(int(fd.handle.fd)) + ctx.UninterruptibleSleepFinish(false) + return err + } + return fd.handle.file.fsync(ctx) } diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go index 0eef4e16e..2cb8191b9 100644 --- a/pkg/sentry/fsimpl/gofer/time.go +++ b/pkg/sentry/fsimpl/gofer/time.go @@ -47,6 +47,7 @@ func (d *dentry) touchAtime(mnt *vfs.Mount) { now := d.fs.clock.Now().Nanoseconds() d.metadataMu.Lock() atomic.StoreInt64(&d.atime, now) + atomic.StoreUint32(&d.atimeDirty, 1) d.metadataMu.Unlock() mnt.EndWrite() } @@ -67,6 +68,7 @@ func (d *dentry) touchCMtime() { d.metadataMu.Lock() atomic.StoreInt64(&d.mtime, now) atomic.StoreInt64(&d.ctime, now) + atomic.StoreUint32(&d.mtimeDirty, 1) d.metadataMu.Unlock() } @@ -76,4 +78,5 @@ func (d *dentry) touchCMtimeLocked() { now := d.fs.clock.Now().Nanoseconds() atomic.StoreInt64(&d.mtime, now) atomic.StoreInt64(&d.ctime, now) + atomic.StoreUint32(&d.mtimeDirty, 1) } diff --git a/pkg/sentry/fsimpl/host/control.go b/pkg/sentry/fsimpl/host/control.go index b9082a20f..0135e4428 100644 --- a/pkg/sentry/fsimpl/host/control.go +++ b/pkg/sentry/fsimpl/host/control.go @@ -58,7 +58,7 @@ func (c *scmRights) Clone() transport.RightsControlMessage { } // Release implements transport.RightsControlMessage.Release. -func (c *scmRights) Release() { +func (c *scmRights) Release(ctx context.Context) { for _, fd := range c.fds { syscall.Close(fd) } diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index c894f2ca0..bf922c566 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -117,7 +117,7 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) d.Init(i) // i.open will take a reference on d. - defer d.DecRef() + defer d.DecRef(ctx) // For simplicity, fileDescription.offset is set to 0. Technically, we // should only set to 0 on files that are not seekable (sockets, pipes, @@ -168,9 +168,9 @@ type filesystem struct { devMinor uint32 } -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { @@ -431,12 +431,12 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre } // DecRef implements kernfs.Inode. -func (i *inode) DecRef() { - i.AtomicRefCount.DecRefWithDestructor(i.Destroy) +func (i *inode) DecRef(ctx context.Context) { + i.AtomicRefCount.DecRefWithDestructor(ctx, i.Destroy) } // Destroy implements kernfs.Inode. -func (i *inode) Destroy() { +func (i *inode) Destroy(context.Context) { if i.wouldBlock { fdnotifier.RemoveFD(int32(i.hostFD)) } @@ -542,7 +542,7 @@ func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux } // Release implements vfs.FileDescriptionImpl. -func (f *fileDescription) Release() { +func (f *fileDescription) Release(context.Context) { // noop } diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go index fd16bd92d..4979dd0a9 100644 --- a/pkg/sentry/fsimpl/host/socket.go +++ b/pkg/sentry/fsimpl/host/socket.go @@ -139,7 +139,7 @@ func NewConnectedEndpoint(ctx context.Context, hostFD int, addr string, saveable } // Send implements transport.ConnectedEndpoint.Send. -func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { +func (c *ConnectedEndpoint) Send(ctx context.Context, data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { c.mu.RLock() defer c.mu.RUnlock() @@ -216,7 +216,7 @@ func (c *ConnectedEndpoint) EventUpdate() { } // Recv implements transport.Receiver.Recv. -func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { +func (c *ConnectedEndpoint) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { c.mu.RLock() defer c.mu.RUnlock() @@ -317,8 +317,8 @@ func (c *ConnectedEndpoint) destroyLocked() { // Release implements transport.ConnectedEndpoint.Release and // transport.Receiver.Release. -func (c *ConnectedEndpoint) Release() { - c.ref.DecRefWithDestructor(func() { +func (c *ConnectedEndpoint) Release(ctx context.Context) { + c.ref.DecRefWithDestructor(ctx, func(context.Context) { c.mu.Lock() c.destroyLocked() c.mu.Unlock() @@ -347,8 +347,8 @@ func (e *SCMConnectedEndpoint) Init() error { // Release implements transport.ConnectedEndpoint.Release and // transport.Receiver.Release. -func (e *SCMConnectedEndpoint) Release() { - e.ref.DecRefWithDestructor(func() { +func (e *SCMConnectedEndpoint) Release(ctx context.Context) { + e.ref.DecRefWithDestructor(ctx, func(context.Context) { e.mu.Lock() if err := syscall.Close(e.fd); err != nil { log.Warningf("Failed to close host fd %d: %v", err) diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go index 4ee9270cc..d372c60cb 100644 --- a/pkg/sentry/fsimpl/host/tty.go +++ b/pkg/sentry/fsimpl/host/tty.go @@ -67,12 +67,12 @@ func (t *TTYFileDescription) ForegroundProcessGroup() *kernel.ProcessGroup { } // Release implements fs.FileOperations.Release. -func (t *TTYFileDescription) Release() { +func (t *TTYFileDescription) Release(ctx context.Context) { t.mu.Lock() t.fgProcessGroup = nil t.mu.Unlock() - t.fileDescription.Release() + t.fileDescription.Release(ctx) } // PRead implements vfs.FileDescriptionImpl. diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index c6c4472e7..12adf727a 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -122,7 +122,7 @@ func (fd *DynamicBytesFD) PWrite(ctx context.Context, src usermem.IOSequence, of } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *DynamicBytesFD) Release() {} +func (fd *DynamicBytesFD) Release(context.Context) {} // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 1d37ccb98..fcee6200a 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -113,7 +113,7 @@ func (fd *GenericDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *GenericDirectoryFD) Release() {} +func (fd *GenericDirectoryFD) Release(context.Context) {} func (fd *GenericDirectoryFD) filesystem() *vfs.Filesystem { return fd.vfsfd.VirtualDentry().Mount().Filesystem() diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 61a36cff9..d7edb6342 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -56,13 +56,13 @@ afterSymlink: return vfsd, nil } if name == ".." { - if isRoot, err := rp.CheckRoot(vfsd); err != nil { + if isRoot, err := rp.CheckRoot(ctx, vfsd); err != nil { return nil, err } else if isRoot || d.parent == nil { rp.Advance() return vfsd, nil } - if err := rp.CheckMount(&d.parent.vfsd); err != nil { + if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { return nil, err } rp.Advance() @@ -77,7 +77,7 @@ afterSymlink: if err != nil { return nil, err } - if err := rp.CheckMount(&next.vfsd); err != nil { + if err := rp.CheckMount(ctx, &next.vfsd); err != nil { return nil, err } // Resolve any symlink at current path component. @@ -88,7 +88,7 @@ afterSymlink: } if targetVD.Ok() { err := rp.HandleJump(targetVD) - targetVD.DecRef() + targetVD.DecRef(ctx) if err != nil { return nil, err } @@ -116,7 +116,7 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // Cached dentry exists, revalidate. if !child.inode.Valid(ctx) { delete(parent.children, name) - vfsObj.InvalidateDentry(&child.vfsd) + vfsObj.InvalidateDentry(ctx, &child.vfsd) fs.deferDecRef(&child.vfsd) // Reference from Lookup. child = nil } @@ -234,7 +234,7 @@ func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Den } // Release implements vfs.FilesystemImpl.Release. -func (fs *Filesystem) Release() { +func (fs *Filesystem) Release(context.Context) { } // Sync implements vfs.FilesystemImpl.Sync. @@ -246,7 +246,7 @@ func (fs *Filesystem) Sync(ctx context.Context) error { // AccessAt implements vfs.Filesystem.Impl.AccessAt. func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { fs.mu.RLock() - defer fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() _, inode, err := fs.walkExistingLocked(ctx, rp) @@ -259,7 +259,7 @@ func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { fs.mu.RLock() - defer fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() vfsd, inode, err := fs.walkExistingLocked(ctx, rp) if err != nil { @@ -282,7 +282,7 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { fs.mu.RLock() - defer fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() vfsd, _, err := fs.walkParentDirLocked(ctx, rp) if err != nil { @@ -300,7 +300,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. fs.mu.Lock() defer fs.mu.Unlock() parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -337,7 +337,7 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v fs.mu.Lock() defer fs.mu.Unlock() parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -365,7 +365,7 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v fs.mu.Lock() defer fs.mu.Unlock() parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -397,7 +397,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // Do not create new file. if opts.Flags&linux.O_CREAT == 0 { fs.mu.RLock() - defer fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() vfsd, inode, err := fs.walkExistingLocked(ctx, rp) if err != nil { @@ -429,7 +429,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf } afterTrailingSymlink: parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) if err != nil { return nil, err } @@ -483,7 +483,7 @@ afterTrailingSymlink: } if targetVD.Ok() { err := rp.HandleJump(targetVD) - targetVD.DecRef() + targetVD.DecRef(ctx) if err != nil { return nil, err } @@ -507,7 +507,7 @@ func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st fs.mu.RLock() d, inode, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return "", err } @@ -526,7 +526,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0 fs.mu.Lock() - defer fs.processDeferredDecRefsLocked() + defer fs.processDeferredDecRefsLocked(ctx) defer fs.mu.Unlock() // Resolve the destination directory first to verify that it's on this @@ -584,7 +584,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) virtfs := rp.VirtualFilesystem() // We can't deadlock here due to lock ordering because we're protected from @@ -615,7 +615,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa dstDir.children = make(map[string]*Dentry) } dstDir.children[pc] = src - virtfs.CommitRenameReplaceDentry(srcVFSD, replaced) + virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaced) return nil } @@ -624,7 +624,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error fs.mu.Lock() defer fs.mu.Unlock() vfsd, inode, err := fs.walkExistingLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -648,7 +648,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error defer parentDentry.dirMu.Unlock() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } @@ -656,7 +656,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error virtfs.AbortDeleteDentry(vfsd) return err } - virtfs.CommitDeleteDentry(vfsd) + virtfs.CommitDeleteDentry(ctx, vfsd) return nil } @@ -665,7 +665,7 @@ func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts fs.mu.RLock() _, inode, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return err } @@ -680,7 +680,7 @@ func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf fs.mu.RLock() _, inode, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return linux.Statx{}, err } @@ -692,7 +692,7 @@ func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu fs.mu.RLock() _, _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return linux.Statfs{}, err } @@ -708,7 +708,7 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ fs.mu.Lock() defer fs.mu.Unlock() parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -733,7 +733,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error fs.mu.Lock() defer fs.mu.Unlock() vfsd, _, err := fs.walkExistingLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -753,7 +753,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error parentDentry.dirMu.Lock() defer parentDentry.dirMu.Unlock() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } @@ -761,7 +761,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error virtfs.AbortDeleteDentry(vfsd) return err } - virtfs.CommitDeleteDentry(vfsd) + virtfs.CommitDeleteDentry(ctx, vfsd) return nil } @@ -770,7 +770,7 @@ func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath fs.mu.RLock() _, inode, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return nil, err } @@ -785,7 +785,7 @@ func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si fs.mu.RLock() _, _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return nil, err } @@ -798,7 +798,7 @@ func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt fs.mu.RLock() _, _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return "", err } @@ -811,7 +811,7 @@ func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt fs.mu.RLock() _, _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return err } @@ -824,7 +824,7 @@ func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, fs.mu.RLock() _, _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return err } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 579e627f0..c3efcf3ec 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -40,7 +40,7 @@ func (InodeNoopRefCount) IncRef() { } // DecRef implements Inode.DecRef. -func (InodeNoopRefCount) DecRef() { +func (InodeNoopRefCount) DecRef(context.Context) { } // TryIncRef implements Inode.TryIncRef. @@ -49,7 +49,7 @@ func (InodeNoopRefCount) TryIncRef() bool { } // Destroy implements Inode.Destroy. -func (InodeNoopRefCount) Destroy() { +func (InodeNoopRefCount) Destroy(context.Context) { } // InodeDirectoryNoNewChildren partially implements the Inode interface. @@ -366,12 +366,12 @@ func (o *OrderedChildren) Init(opts OrderedChildrenOptions) { } // DecRef implements Inode.DecRef. -func (o *OrderedChildren) DecRef() { - o.AtomicRefCount.DecRefWithDestructor(o.Destroy) +func (o *OrderedChildren) DecRef(ctx context.Context) { + o.AtomicRefCount.DecRefWithDestructor(ctx, o.Destroy) } // Destroy cleans up resources referenced by this OrderedChildren. -func (o *OrderedChildren) Destroy() { +func (o *OrderedChildren) Destroy(context.Context) { o.mu.Lock() defer o.mu.Unlock() o.order.Reset() diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 46f207664..080118841 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -116,17 +116,17 @@ func (fs *Filesystem) deferDecRef(d *vfs.Dentry) { // processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the // droppedDentries list. See comment on Filesystem.mu. -func (fs *Filesystem) processDeferredDecRefs() { +func (fs *Filesystem) processDeferredDecRefs(ctx context.Context) { fs.mu.Lock() - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) fs.mu.Unlock() } // Precondition: fs.mu must be held for writing. -func (fs *Filesystem) processDeferredDecRefsLocked() { +func (fs *Filesystem) processDeferredDecRefsLocked(ctx context.Context) { fs.droppedDentriesMu.Lock() for _, d := range fs.droppedDentries { - d.DecRef() + d.DecRef(ctx) } fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse. fs.droppedDentriesMu.Unlock() @@ -212,16 +212,16 @@ func (d *Dentry) isSymlink() bool { } // DecRef implements vfs.DentryImpl.DecRef. -func (d *Dentry) DecRef() { - d.AtomicRefCount.DecRefWithDestructor(d.destroy) +func (d *Dentry) DecRef(ctx context.Context) { + d.AtomicRefCount.DecRefWithDestructor(ctx, d.destroy) } // Precondition: Dentry must be removed from VFS' dentry cache. -func (d *Dentry) destroy() { - d.inode.DecRef() // IncRef from Init. +func (d *Dentry) destroy(ctx context.Context) { + d.inode.DecRef(ctx) // IncRef from Init. d.inode = nil if d.parent != nil { - d.parent.DecRef() // IncRef from Dentry.InsertChild. + d.parent.DecRef(ctx) // IncRef from Dentry.InsertChild. } } @@ -230,7 +230,7 @@ func (d *Dentry) destroy() { // Although Linux technically supports inotify on pseudo filesystems (inotify // is implemented at the vfs layer), it is not particularly useful. It is left // unimplemented until someone actually needs it. -func (d *Dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) {} +func (d *Dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {} // Watches implements vfs.DentryImpl.Watches. func (d *Dentry) Watches() *vfs.Watches { @@ -238,7 +238,7 @@ func (d *Dentry) Watches() *vfs.Watches { } // OnZeroWatches implements vfs.Dentry.OnZeroWatches. -func (d *Dentry) OnZeroWatches() {} +func (d *Dentry) OnZeroWatches(context.Context) {} // InsertChild inserts child into the vfs dentry cache with the given name under // this dentry. This does not update the directory inode, so calling this on @@ -326,12 +326,12 @@ type Inode interface { type inodeRefs interface { IncRef() - DecRef() + DecRef(ctx context.Context) TryIncRef() bool // Destroy is called when the inode reaches zero references. Destroy release // all resources (references) on objects referenced by the inode, including // any child dentries. - Destroy() + Destroy(ctx context.Context) } type inodeMetadata interface { diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index dc407eb1d..c5d5afedf 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -46,7 +46,7 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System { ctx := contexttest.Context(t) creds := auth.CredentialsFromContext(ctx) v := &vfs.VirtualFilesystem{} - if err := v.Init(); err != nil { + if err := v.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{ @@ -163,7 +163,7 @@ func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (* dir := d.fs.newDir(creds, opts.Mode, nil) dirVFSD := dir.VFSDentry() if err := d.OrderedChildren.Insert(name, dirVFSD); err != nil { - dir.DecRef() + dir.DecRef(ctx) return nil, err } d.IncLinks(1) @@ -175,7 +175,7 @@ func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (* f := d.fs.newFile(creds, "") fVFSD := f.VFSDentry() if err := d.OrderedChildren.Insert(name, fVFSD); err != nil { - f.DecRef() + f.DecRef(ctx) return nil, err } return fVFSD, nil @@ -213,7 +213,7 @@ func TestBasic(t *testing.T) { }) }) defer sys.Destroy() - sys.GetDentryOrDie(sys.PathOpAtRoot("file1")).DecRef() + sys.GetDentryOrDie(sys.PathOpAtRoot("file1")).DecRef(sys.Ctx) } func TestMkdirGetDentry(t *testing.T) { @@ -228,7 +228,7 @@ func TestMkdirGetDentry(t *testing.T) { if err := sys.VFS.MkdirAt(sys.Ctx, sys.Creds, pop, &vfs.MkdirOptions{Mode: 0755}); err != nil { t.Fatalf("MkdirAt for PathOperation %+v failed: %v", pop, err) } - sys.GetDentryOrDie(pop).DecRef() + sys.GetDentryOrDie(pop).DecRef(sys.Ctx) } func TestReadStaticFile(t *testing.T) { @@ -246,7 +246,7 @@ func TestReadStaticFile(t *testing.T) { if err != nil { t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) } - defer fd.DecRef() + defer fd.DecRef(sys.Ctx) content, err := sys.ReadToEnd(fd) if err != nil { @@ -273,7 +273,7 @@ func TestCreateNewFileInStaticDir(t *testing.T) { } // Close the file. The file should persist. - fd.DecRef() + fd.DecRef(sys.Ctx) fd, err = sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, &vfs.OpenOptions{ Flags: linux.O_RDONLY, @@ -281,7 +281,7 @@ func TestCreateNewFileInStaticDir(t *testing.T) { if err != nil { t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err) } - fd.DecRef() + fd.DecRef(sys.Ctx) } func TestDirFDReadWrite(t *testing.T) { @@ -297,7 +297,7 @@ func TestDirFDReadWrite(t *testing.T) { if err != nil { t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) } - defer fd.DecRef() + defer fd.DecRef(sys.Ctx) // Read/Write should fail for directory FDs. if _, err := fd.Read(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR { diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go index 8f8dcfafe..b3d19ff82 100644 --- a/pkg/sentry/fsimpl/overlay/copy_up.go +++ b/pkg/sentry/fsimpl/overlay/copy_up.go @@ -98,7 +98,7 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { if err != nil { return err } - defer oldFD.DecRef() + defer oldFD.DecRef(ctx) newFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &newpop, &vfs.OpenOptions{ Flags: linux.O_WRONLY | linux.O_CREAT | linux.O_EXCL, Mode: linux.FileMode(d.mode &^ linux.S_IFMT), @@ -106,7 +106,7 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { if err != nil { return err } - defer newFD.DecRef() + defer newFD.DecRef(ctx) bufIOSeq := usermem.BytesIOSequence(make([]byte, 32*1024)) // arbitrary buffer size for { readN, readErr := oldFD.Read(ctx, bufIOSeq, vfs.ReadOptions{}) @@ -241,13 +241,13 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { Mask: linux.STATX_INO, }) if err != nil { - d.upperVD.DecRef() + d.upperVD.DecRef(ctx) d.upperVD = vfs.VirtualDentry{} cleanupUndoCopyUp() return err } if upperStat.Mask&linux.STATX_INO == 0 { - d.upperVD.DecRef() + d.upperVD.DecRef(ctx) d.upperVD = vfs.VirtualDentry{} cleanupUndoCopyUp() return syserror.EREMOTE diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go index f5c2462a5..fccb94105 100644 --- a/pkg/sentry/fsimpl/overlay/directory.go +++ b/pkg/sentry/fsimpl/overlay/directory.go @@ -46,7 +46,7 @@ func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string readdirErr = err return false } - defer layerFD.DecRef() + defer layerFD.DecRef(ctx) // Reuse slice allocated for maybeWhiteouts from a previous layer to // reduce allocations. @@ -108,7 +108,7 @@ type directoryFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *directoryFD) Release() { +func (fd *directoryFD) Release(ctx context.Context) { } // IterDirents implements vfs.FileDescriptionImpl.IterDirents. @@ -177,7 +177,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { readdirErr = err return false } - defer layerFD.DecRef() + defer layerFD.DecRef(ctx) // Reuse slice allocated for maybeWhiteouts from a previous layer to // reduce allocations. @@ -282,6 +282,6 @@ func (fd *directoryFD) Sync(ctx context.Context) error { return err } err = upperFD.Sync(ctx) - upperFD.DecRef() + upperFD.DecRef(ctx) return err } diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go index 6b705e955..986b36ead 100644 --- a/pkg/sentry/fsimpl/overlay/filesystem.go +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -77,7 +77,7 @@ func putDentrySlice(ds *[]*dentry) { // but dentry slices are allocated lazily, and it's much easier to say "defer // fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() { // fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this. -func (fs *filesystem) renameMuRUnlockAndCheckDrop(ds **[]*dentry) { +func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { fs.renameMu.RUnlock() if *ds == nil { return @@ -85,20 +85,20 @@ func (fs *filesystem) renameMuRUnlockAndCheckDrop(ds **[]*dentry) { if len(**ds) != 0 { fs.renameMu.Lock() for _, d := range **ds { - d.checkDropLocked() + d.checkDropLocked(ctx) } fs.renameMu.Unlock() } putDentrySlice(*ds) } -func (fs *filesystem) renameMuUnlockAndCheckDrop(ds **[]*dentry) { +func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { if *ds == nil { fs.renameMu.Unlock() return } for _, d := range **ds { - d.checkDropLocked() + d.checkDropLocked(ctx) } fs.renameMu.Unlock() putDentrySlice(*ds) @@ -126,13 +126,13 @@ afterSymlink: return d, nil } if name == ".." { - if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil { + if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { return nil, err } else if isRoot || d.parent == nil { rp.Advance() return d, nil } - if err := rp.CheckMount(&d.parent.vfsd); err != nil { + if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { return nil, err } rp.Advance() @@ -142,7 +142,7 @@ afterSymlink: if err != nil { return nil, err } - if err := rp.CheckMount(&child.vfsd); err != nil { + if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, err } if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() { @@ -272,11 +272,11 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str }) if lookupErr != nil { - child.destroyLocked() + child.destroyLocked(ctx) return nil, lookupErr } if !existsOnAnyLayer { - child.destroyLocked() + child.destroyLocked(ctx) return nil, syserror.ENOENT } @@ -430,7 +430,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) start := rp.Start().Impl().(*dentry) parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { @@ -501,7 +501,7 @@ func (fs *filesystem) cleanupRecreateWhiteout(ctx context.Context, vfsObj *vfs.V func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return err @@ -513,7 +513,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -532,7 +532,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -553,7 +553,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) start := rp.Start().Impl().(*dentry) d, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { @@ -720,7 +720,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) start := rp.Start().Impl().(*dentry) if rp.Done() { @@ -825,7 +825,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf fd.LockFD.Init(&d.locks) layerFDOpts := layerFD.Options() if err := fd.vfsfd.Init(fd, layerFlags, mnt, &d.vfsd, &layerFDOpts); err != nil { - layerFD.DecRef() + layerFD.DecRef(ctx) return nil, err } return &fd.vfsfd, nil @@ -920,7 +920,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving fd.LockFD.Init(&child.locks) upperFDOpts := upperFD.Options() if err := fd.vfsfd.Init(fd, upperFlags, mnt, &child.vfsd, &upperFDOpts); err != nil { - upperFD.DecRef() + upperFD.DecRef(ctx) // Don't bother with cleanup; the file was created successfully, we // just can't open it anymore for some reason. return nil, err @@ -932,7 +932,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err @@ -952,7 +952,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa var ds *[]*dentry fs.renameMu.Lock() - defer fs.renameMuUnlockAndCheckDrop(&ds) + defer fs.renameMuUnlockAndCheckDrop(ctx, &ds) newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds) if err != nil { return err @@ -979,7 +979,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) start := rp.Start().Impl().(*dentry) parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { @@ -1001,7 +1001,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) parent.dirMu.Lock() defer parent.dirMu.Unlock() @@ -1086,7 +1086,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } - vfsObj.CommitDeleteDentry(&child.vfsd) + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) delete(parent.children, name) ds = appendDentry(ds, child) parent.dirents = nil @@ -1097,7 +1097,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return err @@ -1132,7 +1132,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return linux.Statx{}, err @@ -1160,7 +1160,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) _, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return linux.Statfs{}, err @@ -1211,7 +1211,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) start := rp.Start().Impl().(*dentry) parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { @@ -1233,7 +1233,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) parent.dirMu.Lock() defer parent.dirMu.Unlock() @@ -1298,7 +1298,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } if child != nil { - vfsObj.CommitDeleteDentry(&child.vfsd) + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) delete(parent.children, name) ds = appendDentry(ds, child) } @@ -1310,7 +1310,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) _, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -1324,7 +1324,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) _, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err @@ -1336,7 +1336,7 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) _, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return err @@ -1348,7 +1348,7 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) _, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return err diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go index c0749e711..d3060a481 100644 --- a/pkg/sentry/fsimpl/overlay/non_directory.go +++ b/pkg/sentry/fsimpl/overlay/non_directory.go @@ -81,11 +81,11 @@ func (fd *nonDirectoryFD) currentFDLocked(ctx context.Context) (*vfs.FileDescrip oldOff, oldOffErr := fd.cachedFD.Seek(ctx, 0, linux.SEEK_CUR) if oldOffErr == nil { if _, err := upperFD.Seek(ctx, oldOff, linux.SEEK_SET); err != nil { - upperFD.DecRef() + upperFD.DecRef(ctx) return nil, err } } - fd.cachedFD.DecRef() + fd.cachedFD.DecRef(ctx) fd.copiedUp = true fd.cachedFD = upperFD fd.cachedFlags = statusFlags @@ -99,8 +99,8 @@ func (fd *nonDirectoryFD) currentFDLocked(ctx context.Context) (*vfs.FileDescrip } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *nonDirectoryFD) Release() { - fd.cachedFD.DecRef() +func (fd *nonDirectoryFD) Release(ctx context.Context) { + fd.cachedFD.DecRef(ctx) fd.cachedFD = nil } @@ -138,7 +138,7 @@ func (fd *nonDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux Mask: layerMask, Sync: opts.Sync, }) - wrappedFD.DecRef() + wrappedFD.DecRef(ctx) if err != nil { return linux.Statx{}, err } @@ -187,7 +187,7 @@ func (fd *nonDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, off if err != nil { return 0, err } - defer wrappedFD.DecRef() + defer wrappedFD.DecRef(ctx) return wrappedFD.PRead(ctx, dst, offset, opts) } @@ -209,7 +209,7 @@ func (fd *nonDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, of if err != nil { return 0, err } - defer wrappedFD.DecRef() + defer wrappedFD.DecRef(ctx) return wrappedFD.PWrite(ctx, src, offset, opts) } @@ -250,7 +250,7 @@ func (fd *nonDirectoryFD) Sync(ctx context.Context) error { return err } wrappedFD.IncRef() - defer wrappedFD.DecRef() + defer wrappedFD.DecRef(ctx) fd.mu.Unlock() return wrappedFD.Sync(ctx) } @@ -261,6 +261,6 @@ func (fd *nonDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOp if err != nil { return err } - defer wrappedFD.DecRef() + defer wrappedFD.DecRef(ctx) return wrappedFD.ConfigureMMap(ctx, opts) } diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go index e720d4825..75cc006bf 100644 --- a/pkg/sentry/fsimpl/overlay/overlay.go +++ b/pkg/sentry/fsimpl/overlay/overlay.go @@ -123,7 +123,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // filesystem with any number of lower layers. } else { vfsroot := vfs.RootFromContext(ctx) - defer vfsroot.DecRef() + defer vfsroot.DecRef(ctx) upperPathname, ok := mopts["upperdir"] if ok { delete(mopts, "upperdir") @@ -147,13 +147,13 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err) return nil, nil, err } - defer upperRoot.DecRef() + defer upperRoot.DecRef(ctx) privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */) if err != nil { ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err) return nil, nil, err } - defer privateUpperRoot.DecRef() + defer privateUpperRoot.DecRef(ctx) fsopts.UpperRoot = privateUpperRoot } lowerPathnamesStr, ok := mopts["lowerdir"] @@ -190,13 +190,13 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err) return nil, nil, err } - defer lowerRoot.DecRef() + defer lowerRoot.DecRef(ctx) privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */) if err != nil { ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err) return nil, nil, err } - defer privateLowerRoot.DecRef() + defer privateLowerRoot.DecRef(ctx) fsopts.LowerRoots = append(fsopts.LowerRoots, privateLowerRoot) } } @@ -264,19 +264,19 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt Mask: rootStatMask, }) if err != nil { - root.destroyLocked() - fs.vfsfs.DecRef() + root.destroyLocked(ctx) + fs.vfsfs.DecRef(ctx) return nil, nil, err } if rootStat.Mask&rootStatMask != rootStatMask { - root.destroyLocked() - fs.vfsfs.DecRef() + root.destroyLocked(ctx) + fs.vfsfs.DecRef(ctx) return nil, nil, syserror.EREMOTE } if isWhiteout(&rootStat) { ctx.Warningf("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout") - root.destroyLocked() - fs.vfsfs.DecRef() + root.destroyLocked(ctx) + fs.vfsfs.DecRef(ctx) return nil, nil, syserror.EINVAL } root.mode = uint32(rootStat.Mode) @@ -319,17 +319,17 @@ func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forc } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { vfsObj := fs.vfsfs.VirtualFilesystem() vfsObj.PutAnonBlockDevMinor(fs.dirDevMinor) for _, lowerDevMinor := range fs.lowerDevMinors { vfsObj.PutAnonBlockDevMinor(lowerDevMinor) } if fs.opts.UpperRoot.Ok() { - fs.opts.UpperRoot.DecRef() + fs.opts.UpperRoot.DecRef(ctx) } for _, lowerRoot := range fs.opts.LowerRoots { - lowerRoot.DecRef() + lowerRoot.DecRef(ctx) } } @@ -452,10 +452,10 @@ func (d *dentry) TryIncRef() bool { } // DecRef implements vfs.DentryImpl.DecRef. -func (d *dentry) DecRef() { +func (d *dentry) DecRef(ctx context.Context) { if refs := atomic.AddInt64(&d.refs, -1); refs == 0 { d.fs.renameMu.Lock() - d.checkDropLocked() + d.checkDropLocked(ctx) d.fs.renameMu.Unlock() } else if refs < 0 { panic("overlay.dentry.DecRef() called without holding a reference") @@ -466,7 +466,7 @@ func (d *dentry) DecRef() { // becomes deleted. // // Preconditions: d.fs.renameMu must be locked for writing. -func (d *dentry) checkDropLocked() { +func (d *dentry) checkDropLocked(ctx context.Context) { // Dentries with a positive reference count must be retained. (The only way // to obtain a reference on a dentry with zero references is via path // resolution, which requires renameMu, so if d.refs is zero then it will @@ -476,14 +476,14 @@ func (d *dentry) checkDropLocked() { return } // Refs is still zero; destroy it. - d.destroyLocked() + d.destroyLocked(ctx) return } // destroyLocked destroys the dentry. // // Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. -func (d *dentry) destroyLocked() { +func (d *dentry) destroyLocked(ctx context.Context) { switch atomic.LoadInt64(&d.refs) { case 0: // Mark the dentry destroyed. @@ -495,10 +495,10 @@ func (d *dentry) destroyLocked() { } if d.upperVD.Ok() { - d.upperVD.DecRef() + d.upperVD.DecRef(ctx) } for _, lowerVD := range d.lowerVDs { - lowerVD.DecRef() + lowerVD.DecRef(ctx) } if d.parent != nil { @@ -510,7 +510,7 @@ func (d *dentry) destroyLocked() { // Drop the reference held by d on its parent without recursively // locking d.fs.renameMu. if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 { - d.parent.checkDropLocked() + d.parent.checkDropLocked(ctx) } else if refs < 0 { panic("overlay.dentry.DecRef() called without holding a reference") } @@ -518,7 +518,7 @@ func (d *dentry) destroyLocked() { } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. -func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) { +func (d *dentry) InotifyWithParent(ctx context.Context, events uint32, cookie uint32, et vfs.EventType) { // TODO(gvisor.dev/issue/1479): Implement inotify. } @@ -531,7 +531,7 @@ func (d *dentry) Watches() *vfs.Watches { // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. // // TODO(gvisor.dev/issue/1479): Implement inotify. -func (d *dentry) OnZeroWatches() {} +func (d *dentry) OnZeroWatches(context.Context) {} // iterLayers invokes yield on each layer comprising d, from top to bottom. If // any call to yield returns false, iterLayer stops iteration. diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go index 811f80a5f..2ca793db9 100644 --- a/pkg/sentry/fsimpl/pipefs/pipefs.go +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -63,9 +63,9 @@ func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } // PrependPath implements vfs.FilesystemImpl.PrependPath. @@ -160,6 +160,6 @@ func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vf inode := newInode(ctx, fs) var d kernfs.Dentry d.Init(inode) - defer d.DecRef() + defer d.DecRef(ctx) return inode.pipe.ReaderWriterPair(mnt, d.VFSDentry(), flags) } diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index 609210253..2463d51cd 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -77,9 +77,9 @@ func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualF } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } // dynamicInode is an overfitted interface for common Inodes with diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index fea29e5f0..f0d3f7f5e 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -43,12 +43,12 @@ func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags) return file, flags } -func taskFDExists(t *kernel.Task, fd int32) bool { +func taskFDExists(ctx context.Context, t *kernel.Task, fd int32) bool { file, _ := getTaskFD(t, fd) if file == nil { return false } - file.DecRef() + file.DecRef(ctx) return true } @@ -68,7 +68,7 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, off var fds []int32 i.task.WithMuLocked(func(t *kernel.Task) { if fdTable := t.FDTable(); fdTable != nil { - fds = fdTable.GetFDs() + fds = fdTable.GetFDs(ctx) } }) @@ -135,7 +135,7 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro return nil, syserror.ENOENT } fd := int32(fdInt) - if !taskFDExists(i.task, fd) { + if !taskFDExists(ctx, i.task, fd) { return nil, syserror.ENOENT } taskDentry := i.fs.newFDSymlink(i.task, fd, i.fs.NextIno()) @@ -204,9 +204,9 @@ func (s *fdSymlink) Readlink(ctx context.Context) (string, error) { if file == nil { return "", syserror.ENOENT } - defer file.DecRef() + defer file.DecRef(ctx) root := vfs.RootFromContext(ctx) - defer root.DecRef() + defer root.DecRef(ctx) return s.task.Kernel().VFS().PathnameWithDeleted(ctx, root, file.VirtualDentry()) } @@ -215,7 +215,7 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen if file == nil { return vfs.VirtualDentry{}, "", syserror.ENOENT } - defer file.DecRef() + defer file.DecRef(ctx) vd := file.VirtualDentry() vd.IncRef() return vd, "", nil @@ -258,7 +258,7 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, return nil, syserror.ENOENT } fd := int32(fdInt) - if !taskFDExists(i.task, fd) { + if !taskFDExists(ctx, i.task, fd) { return nil, syserror.ENOENT } data := &fdInfoData{ @@ -297,7 +297,7 @@ func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { if file == nil { return syserror.ENOENT } - defer file.DecRef() + defer file.DecRef(ctx) // TODO(b/121266871): Include pos, locks, and other data. For now we only // have flags. // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 859b7d727..830b78949 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -677,7 +677,7 @@ func (s *exeSymlink) Readlink(ctx context.Context) (string, error) { if err != nil { return "", err } - defer exec.DecRef() + defer exec.DecRef(ctx) return exec.PathnameWithDeleted(ctx), nil } @@ -692,7 +692,7 @@ func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDent if err != nil { return vfs.VirtualDentry{}, "", err } - defer exec.DecRef() + defer exec.DecRef(ctx) vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry() vd.IncRef() @@ -748,7 +748,7 @@ func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Root has been destroyed. Don't try to read mounts. return nil } - defer rootDir.DecRef() + defer rootDir.DecRef(ctx) i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf) return nil } @@ -779,7 +779,7 @@ func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Root has been destroyed. Don't try to read mounts. return nil } - defer rootDir.DecRef() + defer rootDir.DecRef(ctx) i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf) return nil } @@ -825,7 +825,7 @@ func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.Vir dentry.Init(&namespaceInode{}) vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry()) vd.IncRef() - dentry.DecRef() + dentry.DecRef(ctx) return vd, "", nil } @@ -887,8 +887,8 @@ func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) err } // Release implements FileDescriptionImpl. -func (fd *namespaceFD) Release() { - fd.inode.DecRef() +func (fd *namespaceFD) Release(ctx context.Context) { + fd.inode.DecRef(ctx) } // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go index 6bde27376..a4c884bf9 100644 --- a/pkg/sentry/fsimpl/proc/task_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -212,7 +212,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { continue } if family, _, _ := s.Impl().(socket.SocketVFS2).Type(); family != linux.AF_UNIX { - s.DecRef() + s.DecRef(ctx) // Not a unix socket. continue } @@ -281,7 +281,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { } fmt.Fprintf(buf, "\n") - s.DecRef() + s.DecRef(ctx) } return nil } @@ -359,7 +359,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s)) } if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) { - s.DecRef() + s.DecRef(ctx) // Not tcp4 sockets. continue } @@ -455,7 +455,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, fmt.Fprintf(buf, "\n") - s.DecRef() + s.DecRef(ctx) } return nil @@ -524,7 +524,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s)) } if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM { - s.DecRef() + s.DecRef(ctx) // Not udp4 socket. continue } @@ -600,7 +600,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "\n") - s.DecRef() + s.DecRef(ctx) } return nil } diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index 6dac2afa4..b71778128 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -55,7 +55,8 @@ func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *ke if stack := k.RootNetworkNamespace().Stack(); stack != nil { contents = map[string]*kernfs.Dentry{ "ipv4": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ - "tcp_sack": fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}), + "tcp_recovery": fs.newDentry(root, fs.NextIno(), 0644, &tcpRecoveryData{stack: stack}), + "tcp_sack": fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}), // The following files are simple stubs until they are implemented in // netstack, most of these files are configuration related. We use the @@ -207,3 +208,49 @@ func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset *d.enabled = v != 0 return n, d.stack.SetTCPSACKEnabled(*d.enabled) } + +// tcpRecoveryData implements vfs.WritableDynamicBytesSource for +// /proc/sys/net/ipv4/tcp_recovery. +// +// +stateify savable +type tcpRecoveryData struct { + kernfs.DynamicBytesFile + + stack inet.Stack `state:"wait"` +} + +var _ vfs.WritableDynamicBytesSource = (*tcpRecoveryData)(nil) + +// Generate implements vfs.DynamicBytesSource. +func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error { + recovery, err := d.stack.TCPRecovery() + if err != nil { + return err + } + + buf.WriteString(fmt.Sprintf("%d\n", recovery)) + return nil +} + +func (d *tcpRecoveryData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + // No need to handle partial writes thus far. + return 0, syserror.EINVAL + } + if src.NumBytes() == 0 { + return 0, nil + } + + // Limit the amount of memory allocated. + src = src.TakeFirst(usermem.PageSize - 1) + + var v int32 + n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) + if err != nil { + return 0, err + } + if err := d.stack.SetTCPRecovery(inet.TCPLossRecovery(v)); err != nil { + return 0, err + } + return n, nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 19abb5034..3c9297dee 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -218,7 +218,7 @@ func TestTasks(t *testing.T) { if err != nil { t.Fatalf("vfsfs.OpenAt(%q) failed: %v", path, err) } - defer fd.DecRef() + defer fd.DecRef(s.Ctx) buf := make([]byte, 1) bufIOSeq := usermem.BytesIOSequence(buf) if _, err := fd.Read(s.Ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR { @@ -336,7 +336,7 @@ func TestTasksOffset(t *testing.T) { if err != nil { t.Fatalf("vfsfs.OpenAt(/) failed: %v", err) } - defer fd.DecRef() + defer fd.DecRef(s.Ctx) if _, err := fd.Seek(s.Ctx, tc.offset, linux.SEEK_SET); err != nil { t.Fatalf("Seek(%d, SEEK_SET): %v", tc.offset, err) } @@ -441,7 +441,7 @@ func iterateDir(ctx context.Context, t *testing.T, s *testutil.System, fd *vfs.F t.Errorf("vfsfs.OpenAt(%v) failed: %v", absPath, err) continue } - defer child.DecRef() + defer child.DecRef(ctx) stat, err := child.Stat(ctx, vfs.StatOptions{}) if err != nil { t.Errorf("Stat(%v) failed: %v", absPath, err) @@ -476,7 +476,7 @@ func TestTree(t *testing.T) { if err != nil { t.Fatalf("failed to create test file: %v", err) } - defer file.DecRef() + defer file.DecRef(s.Ctx) var tasks []*kernel.Task for i := 0; i < 5; i++ { @@ -501,5 +501,5 @@ func TestTree(t *testing.T) { t.Fatalf("vfsfs.OpenAt(/proc) failed: %v", err) } iterateDir(ctx, t, s, fd) - fd.DecRef() + fd.DecRef(ctx) } diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go index 242ba9b5d..6297e1df4 100644 --- a/pkg/sentry/fsimpl/signalfd/signalfd.go +++ b/pkg/sentry/fsimpl/signalfd/signalfd.go @@ -54,7 +54,7 @@ var _ vfs.FileDescriptionImpl = (*SignalFileDescription)(nil) // New creates a new signal fd. func New(vfsObj *vfs.VirtualFilesystem, target *kernel.Task, mask linux.SignalSet, flags uint32) (*vfs.FileDescription, error) { vd := vfsObj.NewAnonVirtualDentry("[signalfd]") - defer vd.DecRef() + defer vd.DecRef(target) sfd := &SignalFileDescription{ target: target, mask: mask, @@ -133,4 +133,4 @@ func (sfd *SignalFileDescription) EventUnregister(entry *waiter.Entry) { } // Release implements FileDescriptionImpl.Release() -func (sfd *SignalFileDescription) Release() {} +func (sfd *SignalFileDescription) Release(context.Context) {} diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index ee0828a15..c61818ff6 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -67,9 +67,9 @@ func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } // PrependPath implements vfs.FilesystemImpl.PrependPath. diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 01ce30a4d..0401726b6 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -30,6 +30,7 @@ import ( // Name is the default filesystem name. const Name = "sysfs" +const defaultSysDirMode = linux.FileMode(0755) // FilesystemType implements vfs.FilesystemType. type FilesystemType struct{} @@ -57,9 +58,6 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt devMinor: devMinor, } fs.VFSFilesystem().Init(vfsObj, &fsType, fs) - k := kernel.KernelFromContext(ctx) - maxCPUCores := k.ApplicationCores() - defaultSysDirMode := linux.FileMode(0755) root := fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ "block": fs.newDir(creds, defaultSysDirMode, nil), @@ -70,11 +68,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt "dev": fs.newDir(creds, defaultSysDirMode, nil), "devices": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ "system": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ - "cpu": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ - "online": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), - "possible": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), - "present": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), - }), + "cpu": cpuDir(ctx, fs, creds), }), }), "firmware": fs.newDir(creds, defaultSysDirMode, nil), @@ -86,10 +80,24 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return fs.VFSFilesystem(), root.VFSDentry(), nil } +func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) *kernfs.Dentry { + k := kernel.KernelFromContext(ctx) + maxCPUCores := k.ApplicationCores() + children := map[string]*kernfs.Dentry{ + "online": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), + "possible": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), + "present": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), + } + for i := uint(0); i < maxCPUCores; i++ { + children[fmt.Sprintf("cpu%d", i)] = fs.newDir(creds, linux.FileMode(0555), nil) + } + return fs.newDir(creds, defaultSysDirMode, children) +} + // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } // dir implements kernfs.Inode. diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go index 242d5fd12..9fd38b295 100644 --- a/pkg/sentry/fsimpl/sys/sys_test.go +++ b/pkg/sentry/fsimpl/sys/sys_test.go @@ -59,7 +59,7 @@ func TestReadCPUFile(t *testing.T) { if err != nil { t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err) } - defer fd.DecRef() + defer fd.DecRef(s.Ctx) content, err := s.ReadToEnd(fd) if err != nil { t.Fatalf("Read failed: %v", err) diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index e743e8114..1813269e0 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -122,12 +122,16 @@ func Boot() (*kernel.Kernel, error) { // CreateTask creates a new bare bones task for tests. func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns *vfs.MountNamespace, root, cwd vfs.VirtualDentry) (*kernel.Task, error) { k := kernel.KernelFromContext(ctx) + if k == nil { + return nil, fmt.Errorf("cannot find kernel from context") + } + exe, err := newFakeExecutable(ctx, k.VFS(), auth.CredentialsFromContext(ctx), root) if err != nil { return nil, err } m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation) - m.SetExecutable(fsbridge.NewVFSFile(exe)) + m.SetExecutable(ctx, fsbridge.NewVFSFile(exe)) config := &kernel.TaskConfig{ Kernel: k, diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go index 0556af877..568132121 100644 --- a/pkg/sentry/fsimpl/testutil/testutil.go +++ b/pkg/sentry/fsimpl/testutil/testutil.go @@ -97,8 +97,8 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System { // Destroy release resources associated with a test system. func (s *System) Destroy() { - s.Root.DecRef() - s.MntNs.DecRef() // Reference on MntNs passed to NewSystem. + s.Root.DecRef(s.Ctx) + s.MntNs.DecRef(s.Ctx) // Reference on MntNs passed to NewSystem. } // ReadToEnd reads the contents of fd until EOF to a string. @@ -149,7 +149,7 @@ func (s *System) ListDirents(pop *vfs.PathOperation) *DirentCollector { if err != nil { s.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) } - defer fd.DecRef() + defer fd.DecRef(s.Ctx) collector := &DirentCollector{} if err := fd.IterDirents(s.Ctx, collector); err != nil { diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go index 2dc90d484..86beaa0a8 100644 --- a/pkg/sentry/fsimpl/timerfd/timerfd.go +++ b/pkg/sentry/fsimpl/timerfd/timerfd.go @@ -47,9 +47,9 @@ var _ vfs.FileDescriptionImpl = (*TimerFileDescription)(nil) var _ ktime.TimerListener = (*TimerFileDescription)(nil) // New returns a new timer fd. -func New(vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, flags uint32) (*vfs.FileDescription, error) { +func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, flags uint32) (*vfs.FileDescription, error) { vd := vfsObj.NewAnonVirtualDentry("[timerfd]") - defer vd.DecRef() + defer vd.DecRef(ctx) tfd := &TimerFileDescription{} tfd.timer = ktime.NewTimer(clock, tfd) if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{ @@ -129,7 +129,7 @@ func (tfd *TimerFileDescription) ResumeTimer() { } // Release implements FileDescriptionImpl.Release() -func (tfd *TimerFileDescription) Release() { +func (tfd *TimerFileDescription) Release(context.Context) { tfd.timer.Destroy() } diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index 2fb5c4d84..d263147c2 100644 --- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -83,7 +83,7 @@ func fileOpOn(ctx context.Context, mntns *fs.MountNamespace, root, wd *fs.Dirent } err = fn(root, d) - d.DecRef() + d.DecRef(ctx) return err } @@ -105,17 +105,17 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) { if err != nil { b.Fatalf("failed to create mount namespace: %v", err) } - defer mntns.DecRef() + defer mntns.DecRef(ctx) var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') // Create nested directories with given depth. root := mntns.Root() - defer root.DecRef() + defer root.DecRef(ctx) d := root d.IncRef() - defer d.DecRef() + defer d.DecRef(ctx) for i := depth; i > 0; i-- { name := fmt.Sprintf("%d", i) if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil { @@ -125,7 +125,7 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to directory %q: %v", name, err) } - d.DecRef() + d.DecRef(ctx) d = next filePathBuilder.WriteString(name) filePathBuilder.WriteByte('/') @@ -136,7 +136,7 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) { if err != nil { b.Fatalf("failed to create file %q: %v", filename, err) } - file.DecRef() + file.DecRef(ctx) filePathBuilder.WriteString(filename) filePath := filePathBuilder.String() @@ -176,7 +176,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) { // Create VFS. vfsObj := vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { b.Fatalf("VFS init: %v", err) } vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ @@ -186,14 +186,14 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) { if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } - defer mntns.DecRef() + defer mntns.DecRef(ctx) var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') // Create nested directories with given depth. root := mntns.Root() - defer root.DecRef() + defer root.DecRef(ctx) vd := root vd.IncRef() for i := depth; i > 0; i-- { @@ -212,7 +212,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to directory %q: %v", name, err) } - vd.DecRef() + vd.DecRef(ctx) vd = nextVD filePathBuilder.WriteString(name) filePathBuilder.WriteByte('/') @@ -228,12 +228,12 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) { Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, Mode: 0644, }) - vd.DecRef() + vd.DecRef(ctx) vd = vfs.VirtualDentry{} if err != nil { b.Fatalf("failed to create file %q: %v", filename, err) } - defer fd.DecRef() + defer fd.DecRef(ctx) filePathBuilder.WriteString(filename) filePath := filePathBuilder.String() @@ -278,14 +278,14 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to create mount namespace: %v", err) } - defer mntns.DecRef() + defer mntns.DecRef(ctx) var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') // Create and mount the submount. root := mntns.Root() - defer root.DecRef() + defer root.DecRef(ctx) if err := root.Inode.CreateDirectory(ctx, root, mountPointName, fs.FilePermsFromMode(0755)); err != nil { b.Fatalf("failed to create mount point: %v", err) } @@ -293,7 +293,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to mount point: %v", err) } - defer mountPoint.DecRef() + defer mountPoint.DecRef(ctx) submountInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil) if err != nil { b.Fatalf("failed to create tmpfs submount: %v", err) @@ -309,7 +309,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to mount root: %v", err) } - defer d.DecRef() + defer d.DecRef(ctx) for i := depth; i > 0; i-- { name := fmt.Sprintf("%d", i) if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil { @@ -319,7 +319,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to directory %q: %v", name, err) } - d.DecRef() + d.DecRef(ctx) d = next filePathBuilder.WriteString(name) filePathBuilder.WriteByte('/') @@ -330,7 +330,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to create file %q: %v", filename, err) } - file.DecRef() + file.DecRef(ctx) filePathBuilder.WriteString(filename) filePath := filePathBuilder.String() @@ -370,7 +370,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { // Create VFS. vfsObj := vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { b.Fatalf("VFS init: %v", err) } vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ @@ -380,14 +380,14 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } - defer mntns.DecRef() + defer mntns.DecRef(ctx) var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') // Create the mount point. root := mntns.Root() - defer root.DecRef() + defer root.DecRef(ctx) pop := vfs.PathOperation{ Root: root, Start: root, @@ -403,7 +403,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to mount point: %v", err) } - defer mountPoint.DecRef() + defer mountPoint.DecRef(ctx) // Create and mount the submount. if err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil { b.Fatalf("failed to mount tmpfs submount: %v", err) @@ -432,7 +432,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to directory %q: %v", name, err) } - vd.DecRef() + vd.DecRef(ctx) vd = nextVD filePathBuilder.WriteString(name) filePathBuilder.WriteByte('/') @@ -448,11 +448,11 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, Mode: 0644, }) - vd.DecRef() + vd.DecRef(ctx) if err != nil { b.Fatalf("failed to create file %q: %v", filename, err) } - fd.DecRef() + fd.DecRef(ctx) filePathBuilder.WriteString(filename) filePath := filePathBuilder.String() diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index 0a1ad4765..78b4fc5be 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -95,7 +95,7 @@ type directoryFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *directoryFD) Release() { +func (fd *directoryFD) Release(ctx context.Context) { if fd.iter != nil { dir := fd.inode().impl.(*directory) dir.iterMu.Lock() @@ -110,7 +110,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fs := fd.filesystem() dir := fd.inode().impl.(*directory) - defer fd.dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + defer fd.dentry().InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent) // fs.mu is required to read d.parent and dentry.name. fs.mu.RLock() diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index ef210a69b..065812065 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -40,7 +40,7 @@ func (fs *filesystem) Sync(ctx context.Context) error { // stepLocked is loosely analogous to fs/namei.c:walk_component(). // // Preconditions: filesystem.mu must be locked. !rp.Done(). -func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { +func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { dir, ok := d.inode.impl.(*directory) if !ok { return nil, syserror.ENOTDIR @@ -55,13 +55,13 @@ afterSymlink: return d, nil } if name == ".." { - if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil { + if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { return nil, err } else if isRoot || d.parent == nil { rp.Advance() return d, nil } - if err := rp.CheckMount(&d.parent.vfsd); err != nil { + if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { return nil, err } rp.Advance() @@ -74,7 +74,7 @@ afterSymlink: if !ok { return nil, syserror.ENOENT } - if err := rp.CheckMount(&child.vfsd); err != nil { + if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, err } if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { @@ -98,9 +98,9 @@ afterSymlink: // fs/namei.c:path_parentat(). // // Preconditions: filesystem.mu must be locked. !rp.Done(). -func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*directory, error) { +func walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*directory, error) { for !rp.Final() { - next, err := stepLocked(rp, d) + next, err := stepLocked(ctx, rp, d) if err != nil { return nil, err } @@ -118,10 +118,10 @@ func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*directory, error) { // resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat(). // // Preconditions: filesystem.mu must be locked. -func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) { +func resolveLocked(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error) { d := rp.Start().Impl().(*dentry) for !rp.Done() { - next, err := stepLocked(rp, d) + next, err := stepLocked(ctx, rp, d) if err != nil { return nil, err } @@ -141,10 +141,10 @@ func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) { // // Preconditions: !rp.Done(). For the final path component in rp, // !rp.ShouldFollowSymlink(). -func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error { +func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error { fs.mu.Lock() defer fs.mu.Unlock() - parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } @@ -182,7 +182,7 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa if dir { ev |= linux.IN_ISDIR } - parentDir.inode.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) + parentDir.inode.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) parentDir.inode.touchCMtime() return nil } @@ -191,7 +191,7 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return err } @@ -202,7 +202,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return nil, err } @@ -222,7 +222,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { fs.mu.RLock() defer fs.mu.RUnlock() - dir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + dir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return nil, err } @@ -232,7 +232,7 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa // LinkAt implements vfs.FilesystemImpl.LinkAt. func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { - return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error { if rp.Mount() != vd.Mount() { return syserror.EXDEV } @@ -251,7 +251,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EMLINK } i.incLinksLocked() - i.watches.Notify("", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */) + i.watches.Notify(ctx, "", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */) parentDir.insertChildLocked(fs.newDentry(i), name) return nil }) @@ -259,7 +259,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { - return fs.doCreateAt(rp, true /* dir */, func(parentDir *directory, name string) error { + return fs.doCreateAt(ctx, rp, true /* dir */, func(parentDir *directory, name string) error { creds := rp.Credentials() if parentDir.inode.nlink == maxLinks { return syserror.EMLINK @@ -273,7 +273,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { - return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error { creds := rp.Credentials() var childInode *inode switch opts.Mode.FileType() { @@ -308,7 +308,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if opts.Flags&linux.O_CREAT == 0 { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return nil, err } @@ -330,7 +330,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf return start.open(ctx, rp, &opts, false /* afterCreate */) } afterTrailingSymlink: - parentDir, err := walkParentDirLocked(rp, start) + parentDir, err := walkParentDirLocked(ctx, rp, start) if err != nil { return nil, err } @@ -368,7 +368,7 @@ afterTrailingSymlink: if err != nil { return nil, err } - parentDir.inode.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) + parentDir.inode.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) parentDir.inode.touchCMtime() return fd, nil } @@ -376,7 +376,7 @@ afterTrailingSymlink: return nil, syserror.EEXIST } // Is the file mounted over? - if err := rp.CheckMount(&child.vfsd); err != nil { + if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, err } // Do we need to resolve a trailing symlink? @@ -445,7 +445,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return "", err } @@ -467,7 +467,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // Resolve newParent first to verify that it's on this Mount. fs.mu.Lock() defer fs.mu.Unlock() - newParentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + newParentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } @@ -555,7 +555,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) var replacedVFSD *vfs.Dentry if replaced != nil { replacedVFSD = &replaced.vfsd @@ -566,17 +566,19 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if replaced != nil { newParentDir.removeChildLocked(replaced) if replaced.inode.isDir() { - newParentDir.inode.decLinksLocked() // from replaced's ".." + // Remove links for replaced/. and replaced/.. + replaced.inode.decLinksLocked(ctx) + newParentDir.inode.decLinksLocked(ctx) } - replaced.inode.decLinksLocked() + replaced.inode.decLinksLocked(ctx) } oldParentDir.removeChildLocked(renamed) newParentDir.insertChildLocked(renamed, newName) - vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD) + vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD) oldParentDir.inode.touchCMtime() if oldParentDir != newParentDir { if renamed.inode.isDir() { - oldParentDir.inode.decLinksLocked() + oldParentDir.inode.decLinksLocked(ctx) newParentDir.inode.incLinksLocked() } newParentDir.inode.touchCMtime() @@ -591,7 +593,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() - parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } @@ -626,17 +628,17 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error defer mnt.EndWrite() vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } parentDir.removeChildLocked(child) - parentDir.inode.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) + parentDir.inode.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) // Remove links for child, child/., and child/.. - child.inode.decLinksLocked() - child.inode.decLinksLocked() - parentDir.inode.decLinksLocked() - vfsObj.CommitDeleteDentry(&child.vfsd) + child.inode.decLinksLocked(ctx) + child.inode.decLinksLocked(ctx) + parentDir.inode.decLinksLocked(ctx) + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) parentDir.inode.touchCMtime() return nil } @@ -644,7 +646,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { fs.mu.RLock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return err @@ -656,7 +658,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts fs.mu.RUnlock() if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { - d.InotifyWithParent(ev, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) } return nil } @@ -665,7 +667,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return linux.Statx{}, err } @@ -678,7 +680,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { fs.mu.RLock() defer fs.mu.RUnlock() - if _, err := resolveLocked(rp); err != nil { + if _, err := resolveLocked(ctx, rp); err != nil { return linux.Statfs{}, err } statfs := linux.Statfs{ @@ -695,7 +697,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { - return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error { creds := rp.Credentials() child := fs.newDentry(fs.newSymlink(creds.EffectiveKUID, creds.EffectiveKGID, 0777, target)) parentDir.insertChildLocked(child, name) @@ -707,7 +709,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() - parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } @@ -738,7 +740,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error defer mnt.EndWrite() vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } @@ -746,11 +748,11 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error // Generate inotify events. Note that this must take place before the link // count of the child is decremented, or else the watches may be dropped // before these events are added. - vfs.InotifyRemoveChild(&child.inode.watches, &parentDir.inode.watches, name) + vfs.InotifyRemoveChild(ctx, &child.inode.watches, &parentDir.inode.watches, name) parentDir.removeChildLocked(child) - child.inode.decLinksLocked() - vfsObj.CommitDeleteDentry(&child.vfsd) + child.inode.decLinksLocked(ctx) + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) parentDir.inode.touchCMtime() return nil } @@ -759,7 +761,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return nil, err } @@ -778,7 +780,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return nil, err } @@ -789,7 +791,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return "", err } @@ -799,7 +801,7 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { fs.mu.RLock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return err @@ -810,14 +812,14 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt } fs.mu.RUnlock() - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { fs.mu.RLock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return err @@ -828,7 +830,7 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, } fs.mu.RUnlock() - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go index 1614f2c39..ec2701d8b 100644 --- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go +++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go @@ -32,7 +32,7 @@ const fileName = "mypipe" func TestSeparateFDs(t *testing.T) { ctx, creds, vfsObj, root := setup(t) - defer root.DecRef() + defer root.DecRef(ctx) // Open the read side. This is done in a concurrently because opening // One end the pipe blocks until the other end is opened. @@ -55,13 +55,13 @@ func TestSeparateFDs(t *testing.T) { if err != nil { t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) } - defer wfd.DecRef() + defer wfd.DecRef(ctx) rfd, ok := <-rfdchan if !ok { t.Fatalf("failed to open pipe for reading %q", fileName) } - defer rfd.DecRef() + defer rfd.DecRef(ctx) const msg = "vamos azul" checkEmpty(ctx, t, rfd) @@ -71,7 +71,7 @@ func TestSeparateFDs(t *testing.T) { func TestNonblockingRead(t *testing.T) { ctx, creds, vfsObj, root := setup(t) - defer root.DecRef() + defer root.DecRef(ctx) // Open the read side as nonblocking. pop := vfs.PathOperation{ @@ -85,7 +85,7 @@ func TestNonblockingRead(t *testing.T) { if err != nil { t.Fatalf("failed to open pipe for reading %q: %v", fileName, err) } - defer rfd.DecRef() + defer rfd.DecRef(ctx) // Open the write side. openOpts = vfs.OpenOptions{Flags: linux.O_WRONLY} @@ -93,7 +93,7 @@ func TestNonblockingRead(t *testing.T) { if err != nil { t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) } - defer wfd.DecRef() + defer wfd.DecRef(ctx) const msg = "geh blau" checkEmpty(ctx, t, rfd) @@ -103,7 +103,7 @@ func TestNonblockingRead(t *testing.T) { func TestNonblockingWriteError(t *testing.T) { ctx, creds, vfsObj, root := setup(t) - defer root.DecRef() + defer root.DecRef(ctx) // Open the write side as nonblocking, which should return ENXIO. pop := vfs.PathOperation{ @@ -121,7 +121,7 @@ func TestNonblockingWriteError(t *testing.T) { func TestSingleFD(t *testing.T) { ctx, creds, vfsObj, root := setup(t) - defer root.DecRef() + defer root.DecRef(ctx) // Open the pipe as readable and writable. pop := vfs.PathOperation{ @@ -135,7 +135,7 @@ func TestSingleFD(t *testing.T) { if err != nil { t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) } - defer fd.DecRef() + defer fd.DecRef(ctx) const msg = "forza blu" checkEmpty(ctx, t, fd) @@ -152,7 +152,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy // Create VFS. vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index abbaa5d60..0710b65db 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -270,7 +270,7 @@ type regularFileFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *regularFileFD) Release() { +func (fd *regularFileFD) Release(context.Context) { // noop } diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 2545d88e9..4681a2f52 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -185,7 +185,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt case linux.S_IFDIR: root = &fs.newDirectory(rootKUID, rootKGID, rootMode).dentry default: - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) } return &fs.vfsfs, &root.vfsd, nil @@ -197,7 +197,7 @@ func NewFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *au } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) } @@ -249,12 +249,12 @@ func (d *dentry) TryIncRef() bool { } // DecRef implements vfs.DentryImpl.DecRef. -func (d *dentry) DecRef() { - d.inode.decRef() +func (d *dentry) DecRef(ctx context.Context) { + d.inode.decRef(ctx) } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. -func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) { +func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { if d.inode.isDir() { events |= linux.IN_ISDIR } @@ -266,9 +266,9 @@ func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) { d.inode.fs.mu.RLock() // The ordering below is important, Linux always notifies the parent first. if d.parent != nil { - d.parent.inode.watches.Notify(d.name, events, cookie, et, deleted) + d.parent.inode.watches.Notify(ctx, d.name, events, cookie, et, deleted) } - d.inode.watches.Notify("", events, cookie, et, deleted) + d.inode.watches.Notify(ctx, "", events, cookie, et, deleted) d.inode.fs.mu.RUnlock() } @@ -278,7 +278,7 @@ func (d *dentry) Watches() *vfs.Watches { } // OnZeroWatches implements vfs.Dentry.OnZeroWatches. -func (d *dentry) OnZeroWatches() {} +func (d *dentry) OnZeroWatches(context.Context) {} // inode represents a filesystem object. type inode struct { @@ -359,12 +359,12 @@ func (i *inode) incLinksLocked() { // remove a reference on i as well. // // Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. -func (i *inode) decLinksLocked() { +func (i *inode) decLinksLocked(ctx context.Context) { if i.nlink == 0 { panic("tmpfs.inode.decLinksLocked() called with no existing links") } if atomic.AddUint32(&i.nlink, ^uint32(0)) == 0 { - i.decRef() + i.decRef(ctx) } } @@ -386,9 +386,9 @@ func (i *inode) tryIncRef() bool { } } -func (i *inode) decRef() { +func (i *inode) decRef(ctx context.Context) { if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { - i.watches.HandleDeletion() + i.watches.HandleDeletion(ctx) if regFile, ok := i.impl.(*regularFile); ok { // Release memory used by regFile to store data. Since regFile is // no longer usable, we don't need to grab any locks or update any @@ -558,6 +558,8 @@ func (i *inode) direntType() uint8 { return linux.DT_LNK case *socketFile: return linux.DT_SOCK + case *namedPipe: + return linux.DT_FIFO case *deviceFile: switch impl.kind { case vfs.BlockDevice: @@ -701,7 +703,7 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) } if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { - d.InotifyWithParent(ev, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) } return nil } @@ -724,7 +726,7 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption } // Generate inotify events. - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } @@ -736,13 +738,13 @@ func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { } // Generate inotify events. - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } // NewMemfd creates a new tmpfs regular file and file description that can back // an anonymous fd created by memfd_create. -func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name string) (*vfs.FileDescription, error) { +func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) { fs, ok := mount.Filesystem().Impl().(*filesystem) if !ok { panic("NewMemfd() called with non-tmpfs mount") @@ -757,7 +759,7 @@ func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name s } d := fs.newDentry(inode) - defer d.DecRef() + defer d.DecRef(ctx) d.name = name // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go index a240fb276..6f3e3ae6f 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go @@ -34,7 +34,7 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr creds := auth.CredentialsFromContext(ctx) vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err) } @@ -47,8 +47,8 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr } root := mntns.Root() return vfsObj, root, func() { - root.DecRef() - mntns.DecRef() + root.DecRef(ctx) + mntns.DecRef(ctx) }, nil } diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go index 2916a0644..c0b4831d1 100644 --- a/pkg/sentry/inet/inet.go +++ b/pkg/sentry/inet/inet.go @@ -56,6 +56,12 @@ type Stack interface { // settings. SetTCPSACKEnabled(enabled bool) error + // TCPRecovery returns the TCP loss detection algorithm. + TCPRecovery() (TCPLossRecovery, error) + + // SetTCPRecovery attempts to change TCP loss detection algorithm. + SetTCPRecovery(recovery TCPLossRecovery) error + // Statistics reports stack statistics. Statistics(stat interface{}, arg string) error @@ -189,3 +195,14 @@ type StatSNMPUDP [8]uint64 // StatSNMPUDPLite describes UdpLite line of /proc/net/snmp. type StatSNMPUDPLite [8]uint64 + +// TCPLossRecovery indicates TCP loss detection and recovery methods to use. +type TCPLossRecovery int32 + +// Loss recovery constants from include/net/tcp.h which are used to set +// /proc/sys/net/ipv4/tcp_recovery. +const ( + TCP_RACK_LOSS_DETECTION TCPLossRecovery = 1 << iota + TCP_RACK_STATIC_REO_WND + TCP_RACK_NO_DUPTHRESH +) diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go index d8961fc94..9771f01fc 100644 --- a/pkg/sentry/inet/test_stack.go +++ b/pkg/sentry/inet/test_stack.go @@ -25,6 +25,7 @@ type TestStack struct { TCPRecvBufSize TCPBufferSize TCPSendBufSize TCPBufferSize TCPSACKFlag bool + Recovery TCPLossRecovery } // NewTestStack returns a TestStack with no network interfaces. The value of @@ -91,6 +92,17 @@ func (s *TestStack) SetTCPSACKEnabled(enabled bool) error { return nil } +// TCPRecovery implements Stack.TCPRecovery. +func (s *TestStack) TCPRecovery() (TCPLossRecovery, error) { + return s.Recovery, nil +} + +// SetTCPRecovery implements Stack.SetTCPRecovery. +func (s *TestStack) SetTCPRecovery(recovery TCPLossRecovery) error { + s.Recovery = recovery + return nil +} + // Statistics implements inet.Stack.Statistics. func (s *TestStack) Statistics(stat interface{}, arg string) error { return nil diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go index 920fe4329..52ed5cea2 100644 --- a/pkg/sentry/kernel/abstract_socket_namespace.go +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -17,6 +17,7 @@ package kernel import ( "syscall" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" @@ -31,7 +32,7 @@ type abstractEndpoint struct { } // WeakRefGone implements refs.WeakRefUser.WeakRefGone. -func (e *abstractEndpoint) WeakRefGone() { +func (e *abstractEndpoint) WeakRefGone(context.Context) { e.ns.mu.Lock() if e.ns.endpoints[e.name].ep == e.ep { delete(e.ns.endpoints, e.name) @@ -64,9 +65,9 @@ type boundEndpoint struct { } // Release implements transport.BoundEndpoint.Release. -func (e *boundEndpoint) Release() { - e.rc.DecRef() - e.BoundEndpoint.Release() +func (e *boundEndpoint) Release(ctx context.Context) { + e.rc.DecRef(ctx) + e.BoundEndpoint.Release(ctx) } // BoundEndpoint retrieves the endpoint bound to the given name. The return @@ -93,13 +94,13 @@ func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndp // // When the last reference managed by rc is dropped, ep may be removed from the // namespace. -func (a *AbstractSocketNamespace) Bind(name string, ep transport.BoundEndpoint, rc refs.RefCounter) error { +func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, rc refs.RefCounter) error { a.mu.Lock() defer a.mu.Unlock() if ep, ok := a.endpoints[name]; ok { if rc := ep.wr.Get(); rc != nil { - rc.DecRef() + rc.DecRef(ctx) return syscall.EADDRINUSE } } diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 4c0f1e41f..15519f0df 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -76,8 +76,8 @@ type pollEntry struct { // WeakRefGone implements refs.WeakRefUser.WeakRefGone. // weakReferenceGone is called when the file in the weak reference is destroyed. // The poll entry is removed in response to this. -func (p *pollEntry) WeakRefGone() { - p.epoll.RemoveEntry(p.id) +func (p *pollEntry) WeakRefGone(ctx context.Context) { + p.epoll.RemoveEntry(ctx, p.id) } // EventPoll holds all the state associated with an event poll object, that is, @@ -144,14 +144,14 @@ func NewEventPoll(ctx context.Context) *fs.File { // name matches fs/eventpoll.c:epoll_create1. dirent := fs.NewDirent(ctx, anon.NewInode(ctx), fmt.Sprintf("anon_inode:[eventpoll]")) // Release the initial dirent reference after NewFile takes a reference. - defer dirent.DecRef() + defer dirent.DecRef(ctx) return fs.NewFile(ctx, dirent, fs.FileFlags{}, &EventPoll{ files: make(map[FileIdentifier]*pollEntry), }) } // Release implements fs.FileOperations.Release. -func (e *EventPoll) Release() { +func (e *EventPoll) Release(ctx context.Context) { // We need to take the lock now because files may be attempting to // remove entries in parallel if they get destroyed. e.mu.Lock() @@ -160,7 +160,7 @@ func (e *EventPoll) Release() { // Go through all entries and clean up. for _, entry := range e.files { entry.id.File.EventUnregister(&entry.waiter) - entry.file.Drop() + entry.file.Drop(ctx) } e.files = nil } @@ -423,7 +423,7 @@ func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter } // RemoveEntry a files from the collection of observed files. -func (e *EventPoll) RemoveEntry(id FileIdentifier) error { +func (e *EventPoll) RemoveEntry(ctx context.Context, id FileIdentifier) error { e.mu.Lock() defer e.mu.Unlock() @@ -445,7 +445,7 @@ func (e *EventPoll) RemoveEntry(id FileIdentifier) error { // Remove file from map, and drop weak reference. delete(e.files, id) - entry.file.Drop() + entry.file.Drop(ctx) return nil } diff --git a/pkg/sentry/kernel/epoll/epoll_test.go b/pkg/sentry/kernel/epoll/epoll_test.go index 22630e9c5..55b505593 100644 --- a/pkg/sentry/kernel/epoll/epoll_test.go +++ b/pkg/sentry/kernel/epoll/epoll_test.go @@ -26,7 +26,8 @@ func TestFileDestroyed(t *testing.T) { f := filetest.NewTestFile(t) id := FileIdentifier{f, 12} - efile := NewEventPoll(contexttest.Context(t)) + ctx := contexttest.Context(t) + efile := NewEventPoll(ctx) e := efile.FileOperations.(*EventPoll) if err := e.AddEntry(id, 0, waiter.EventIn, [2]int32{}); err != nil { t.Fatalf("addEntry failed: %v", err) @@ -44,7 +45,7 @@ func TestFileDestroyed(t *testing.T) { } // Destroy the file. Check that we get no more events. - f.DecRef() + f.DecRef(ctx) evt = e.ReadEvents(1) if len(evt) != 0 { diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index 87951adeb..bbf568dfc 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -70,7 +70,7 @@ func New(ctx context.Context, initVal uint64, semMode bool) *fs.File { // name matches fs/eventfd.c:eventfd_file_create. dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[eventfd]") // Release the initial dirent reference after NewFile takes a reference. - defer dirent.DecRef() + defer dirent.DecRef(ctx) return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &EventOperations{ val: initVal, semMode: semMode, @@ -106,7 +106,7 @@ func (e *EventOperations) HostFD() (int, error) { } // Release implements fs.FileOperations.Release. -func (e *EventOperations) Release() { +func (e *EventOperations) Release(context.Context) { e.mu.Lock() defer e.mu.Unlock() if e.hostfd >= 0 { diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 4b7d234a4..ce53af69b 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -98,7 +98,7 @@ type FDTable struct { func (f *FDTable) saveDescriptorTable() map[int32]descriptor { m := make(map[int32]descriptor) - f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + f.forEach(context.Background(), func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { m[fd] = descriptor{ file: file, fileVFS2: fileVFS2, @@ -109,6 +109,7 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor { } func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) { + ctx := context.Background() f.init() // Initialize table. for fd, d := range m { f.setAll(fd, d.file, d.fileVFS2, d.flags) @@ -118,9 +119,9 @@ func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) { // reference taken by set above. switch { case d.file != nil: - d.file.DecRef() + d.file.DecRef(ctx) case d.fileVFS2 != nil: - d.fileVFS2.DecRef() + d.fileVFS2.DecRef(ctx) } } } @@ -144,14 +145,15 @@ func (f *FDTable) drop(file *fs.File) { d.InotifyEvent(ev, 0) // Drop the table reference. - file.DecRef() + file.DecRef(context.Background()) } // dropVFS2 drops the table reference. func (f *FDTable) dropVFS2(file *vfs.FileDescription) { // Release any POSIX lock possibly held by the FDTable. Range {0, 0} means the // entire file. - err := file.UnlockPOSIX(context.Background(), f, 0, 0, linux.SEEK_SET) + ctx := context.Background() + err := file.UnlockPOSIX(ctx, f, 0, 0, linux.SEEK_SET) if err != nil && err != syserror.ENOLCK { panic(fmt.Sprintf("UnlockPOSIX failed: %v", err)) } @@ -161,10 +163,10 @@ func (f *FDTable) dropVFS2(file *vfs.FileDescription) { if file.IsWritable() { ev = linux.IN_CLOSE_WRITE } - file.Dentry().InotifyWithParent(ev, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(ctx, ev, 0, vfs.PathEvent) // Drop the table's reference. - file.DecRef() + file.DecRef(ctx) } // NewFDTable allocates a new FDTable that may be used by tasks in k. @@ -175,15 +177,15 @@ func (k *Kernel) NewFDTable() *FDTable { } // destroy removes all of the file descriptors from the map. -func (f *FDTable) destroy() { - f.RemoveIf(func(*fs.File, *vfs.FileDescription, FDFlags) bool { +func (f *FDTable) destroy(ctx context.Context) { + f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool { return true }) } // DecRef implements RefCounter.DecRef with destructor f.destroy. -func (f *FDTable) DecRef() { - f.DecRefWithDestructor(f.destroy) +func (f *FDTable) DecRef(ctx context.Context) { + f.DecRefWithDestructor(ctx, f.destroy) } // Size returns the number of file descriptor slots currently allocated. @@ -195,7 +197,7 @@ func (f *FDTable) Size() int { // forEach iterates over all non-nil files in sorted order. // // It is the caller's responsibility to acquire an appropriate lock. -func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { +func (f *FDTable) forEach(ctx context.Context, fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { // retries tracks the number of failed TryIncRef attempts for the same FD. retries := 0 fd := int32(0) @@ -214,7 +216,7 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes continue // Race caught. } fn(fd, file, nil, flags) - file.DecRef() + file.DecRef(ctx) case fileVFS2 != nil: if !fileVFS2.TryIncRef() { retries++ @@ -224,7 +226,7 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes continue // Race caught. } fn(fd, nil, fileVFS2, flags) - fileVFS2.DecRef() + fileVFS2.DecRef(ctx) } retries = 0 fd++ @@ -234,7 +236,8 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes // String is a stringer for FDTable. func (f *FDTable) String() string { var buf strings.Builder - f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + ctx := context.Background() + f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { switch { case file != nil: n, _ := file.Dirent.FullName(nil /* root */) @@ -242,7 +245,7 @@ func (f *FDTable) String() string { case fileVFS2 != nil: vfsObj := fileVFS2.Mount().Filesystem().VirtualFilesystem() - name, err := vfsObj.PathnameWithDeleted(context.Background(), vfs.VirtualDentry{}, fileVFS2.VirtualDentry()) + name, err := vfsObj.PathnameWithDeleted(ctx, vfs.VirtualDentry{}, fileVFS2.VirtualDentry()) if err != nil { fmt.Fprintf(&buf, "<err: %v>\n", err) return @@ -541,9 +544,9 @@ func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) { // // Precondition: The caller must be running on the task goroutine, or Task.mu // must be locked. -func (f *FDTable) GetFDs() []int32 { +func (f *FDTable) GetFDs(ctx context.Context) []int32 { fds := make([]int32, 0, int(atomic.LoadInt32(&f.used))) - f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) { + f.forEach(ctx, func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) { fds = append(fds, fd) }) return fds @@ -552,9 +555,9 @@ func (f *FDTable) GetFDs() []int32 { // GetRefs returns a stable slice of references to all files and bumps the // reference count on each. The caller must use DecRef on each reference when // they're done using the slice. -func (f *FDTable) GetRefs() []*fs.File { +func (f *FDTable) GetRefs(ctx context.Context) []*fs.File { files := make([]*fs.File, 0, f.Size()) - f.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + f.forEach(ctx, func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { file.IncRef() // Acquire a reference for caller. files = append(files, file) }) @@ -564,9 +567,9 @@ func (f *FDTable) GetRefs() []*fs.File { // GetRefsVFS2 returns a stable slice of references to all files and bumps the // reference count on each. The caller must use DecRef on each reference when // they're done using the slice. -func (f *FDTable) GetRefsVFS2() []*vfs.FileDescription { +func (f *FDTable) GetRefsVFS2(ctx context.Context) []*vfs.FileDescription { files := make([]*vfs.FileDescription, 0, f.Size()) - f.forEach(func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) { + f.forEach(ctx, func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) { file.IncRef() // Acquire a reference for caller. files = append(files, file) }) @@ -574,10 +577,10 @@ func (f *FDTable) GetRefsVFS2() []*vfs.FileDescription { } // Fork returns an independent FDTable. -func (f *FDTable) Fork() *FDTable { +func (f *FDTable) Fork(ctx context.Context) *FDTable { clone := f.k.NewFDTable() - f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { // The set function here will acquire an appropriate table // reference for the clone. We don't need anything else. switch { @@ -622,11 +625,11 @@ func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) { } // RemoveIf removes all FDs where cond is true. -func (f *FDTable) RemoveIf(cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) { +func (f *FDTable) RemoveIf(ctx context.Context, cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) { f.mu.Lock() defer f.mu.Unlock() - f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { if cond(file, fileVFS2, flags) { f.set(fd, nil, FDFlags{}) // Clear from table. // Update current available position. diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go index 29f95a2c4..e3f30ba2a 100644 --- a/pkg/sentry/kernel/fd_table_test.go +++ b/pkg/sentry/kernel/fd_table_test.go @@ -154,7 +154,7 @@ func TestFDTable(t *testing.T) { if ref == nil { t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success") } - ref.DecRef() + ref.DecRef(ctx) if ref, _ := fdTable.Remove(1); ref != nil { t.Fatalf("r.Remove(1) for a removed FD: got success, want failure") @@ -191,7 +191,7 @@ func BenchmarkFDLookupAndDecRef(b *testing.B) { b.StartTimer() // Benchmark. for i := 0; i < b.N; i++ { tf, _ := fdTable.Get(fds[i%len(fds)]) - tf.DecRef() + tf.DecRef(ctx) } }) } @@ -219,7 +219,7 @@ func BenchmarkFDLookupAndDecRefConcurrent(b *testing.B) { defer wg.Done() for i := 0; i < each; i++ { tf, _ := fdTable.Get(fds[i%len(fds)]) - tf.DecRef() + tf.DecRef(ctx) } }() } diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index 47f78df9a..8f2d36d5a 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -17,6 +17,7 @@ package kernel import ( "fmt" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -89,28 +90,28 @@ func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext { // Note that there may still be calls to WorkingDirectory() or RootDirectory() // (that return nil). This is because valid references may still be held via // proc files or other mechanisms. -func (f *FSContext) destroy() { +func (f *FSContext) destroy(ctx context.Context) { // Hold f.mu so that we don't race with RootDirectory() and // WorkingDirectory(). f.mu.Lock() defer f.mu.Unlock() if VFS2Enabled { - f.rootVFS2.DecRef() + f.rootVFS2.DecRef(ctx) f.rootVFS2 = vfs.VirtualDentry{} - f.cwdVFS2.DecRef() + f.cwdVFS2.DecRef(ctx) f.cwdVFS2 = vfs.VirtualDentry{} } else { - f.root.DecRef() + f.root.DecRef(ctx) f.root = nil - f.cwd.DecRef() + f.cwd.DecRef(ctx) f.cwd = nil } } // DecRef implements RefCounter.DecRef with destructor f.destroy. -func (f *FSContext) DecRef() { - f.DecRefWithDestructor(f.destroy) +func (f *FSContext) DecRef(ctx context.Context) { + f.DecRefWithDestructor(ctx, f.destroy) } // Fork forks this FSContext. @@ -165,7 +166,7 @@ func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry { // This will take an extra reference on the Dirent. // // This is not a valid call after destroy. -func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) { +func (f *FSContext) SetWorkingDirectory(ctx context.Context, d *fs.Dirent) { if d == nil { panic("FSContext.SetWorkingDirectory called with nil dirent") } @@ -180,21 +181,21 @@ func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) { old := f.cwd f.cwd = d d.IncRef() - old.DecRef() + old.DecRef(ctx) } // SetWorkingDirectoryVFS2 sets the current working directory. // This will take an extra reference on the VirtualDentry. // // This is not a valid call after destroy. -func (f *FSContext) SetWorkingDirectoryVFS2(d vfs.VirtualDentry) { +func (f *FSContext) SetWorkingDirectoryVFS2(ctx context.Context, d vfs.VirtualDentry) { f.mu.Lock() defer f.mu.Unlock() old := f.cwdVFS2 f.cwdVFS2 = d d.IncRef() - old.DecRef() + old.DecRef(ctx) } // RootDirectory returns the current filesystem root. @@ -226,7 +227,7 @@ func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry { // This will take an extra reference on the Dirent. // // This is not a valid call after free. -func (f *FSContext) SetRootDirectory(d *fs.Dirent) { +func (f *FSContext) SetRootDirectory(ctx context.Context, d *fs.Dirent) { if d == nil { panic("FSContext.SetRootDirectory called with nil dirent") } @@ -241,13 +242,13 @@ func (f *FSContext) SetRootDirectory(d *fs.Dirent) { old := f.root f.root = d d.IncRef() - old.DecRef() + old.DecRef(ctx) } // SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd. // // This is not a valid call after free. -func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) { +func (f *FSContext) SetRootDirectoryVFS2(ctx context.Context, vd vfs.VirtualDentry) { if !vd.Ok() { panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry") } @@ -263,7 +264,7 @@ func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) { vd.IncRef() f.rootVFS2 = vd f.mu.Unlock() - old.DecRef() + old.DecRef(ctx) } // Umask returns the current umask. diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index c5021f2db..daa2dae76 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -51,6 +51,7 @@ go_test( srcs = ["futex_test.go"], library = ":futex", deps = [ + "//pkg/context", "//pkg/sync", "//pkg/usermem", ], diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index bcc1b29a8..e4dcc4d40 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -19,6 +19,7 @@ package futex import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -66,9 +67,9 @@ type Key struct { Offset uint64 } -func (k *Key) release() { +func (k *Key) release(t Target) { if k.MappingIdentity != nil { - k.MappingIdentity.DecRef() + k.MappingIdentity.DecRef(t) } k.Mappable = nil k.MappingIdentity = nil @@ -94,6 +95,8 @@ func (k *Key) matches(k2 *Key) bool { // Target abstracts memory accesses and keys. type Target interface { + context.Context + // SwapUint32 gives access to usermem.IO.SwapUint32. SwapUint32(addr usermem.Addr, new uint32) (uint32, error) @@ -296,7 +299,7 @@ func (b *bucket) wakeWaiterLocked(w *Waiter) { // bucket "to". // // Preconditions: b and to must be locked. -func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int { +func (b *bucket) requeueLocked(t Target, to *bucket, key, nkey *Key, n int) int { done := 0 for w := b.waiters.Front(); done < n && w != nil; { if !w.key.matches(key) { @@ -308,7 +311,7 @@ func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int { requeued := w w = w.Next() // Next iteration. b.waiters.Remove(requeued) - requeued.key.release() + requeued.key.release(t) requeued.key = nkey.clone() to.waiters.PushBack(requeued) requeued.bucket.Store(to) @@ -456,7 +459,7 @@ func (m *Manager) Wake(t Target, addr usermem.Addr, private bool, bitmask uint32 r := b.wakeLocked(&k, bitmask, n) b.mu.Unlock() - k.release() + k.release(t) return r, nil } @@ -465,12 +468,12 @@ func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, ch if err != nil { return 0, err } - defer k1.release() + defer k1.release(t) k2, err := getKey(t, naddr, private) if err != nil { return 0, err } - defer k2.release() + defer k2.release(t) b1, b2 := m.lockBuckets(&k1, &k2) defer b1.mu.Unlock() @@ -488,7 +491,7 @@ func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, ch done := b1.wakeLocked(&k1, ^uint32(0), nwake) // Requeue the number required. - b1.requeueLocked(b2, &k1, &k2, nreq) + b1.requeueLocked(t, b2, &k1, &k2, nreq) return done, nil } @@ -515,12 +518,12 @@ func (m *Manager) WakeOp(t Target, addr1, addr2 usermem.Addr, private bool, nwak if err != nil { return 0, err } - defer k1.release() + defer k1.release(t) k2, err := getKey(t, addr2, private) if err != nil { return 0, err } - defer k2.release() + defer k2.release(t) b1, b2 := m.lockBuckets(&k1, &k2) defer b1.mu.Unlock() @@ -571,7 +574,7 @@ func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bo // Perform our atomic check. if err := check(t, addr, val); err != nil { b.mu.Unlock() - w.key.release() + w.key.release(t) return err } @@ -585,7 +588,7 @@ func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bo // WaitComplete must be called when a Waiter previously added by WaitPrepare is // no longer eligible to be woken. -func (m *Manager) WaitComplete(w *Waiter) { +func (m *Manager) WaitComplete(w *Waiter, t Target) { // Remove w from the bucket it's in. for { b := w.bucket.Load() @@ -617,7 +620,7 @@ func (m *Manager) WaitComplete(w *Waiter) { } // Release references held by the waiter. - w.key.release() + w.key.release(t) } // LockPI attempts to lock the futex following the Priority-inheritance futex @@ -648,13 +651,13 @@ func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, pri success, err := m.lockPILocked(w, t, addr, tid, b, try) if err != nil { - w.key.release() + w.key.release(t) b.mu.Unlock() return false, err } if success || try { // Release waiter if it's not going to be a wait. - w.key.release() + w.key.release(t) } b.mu.Unlock() return success, nil @@ -730,7 +733,7 @@ func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool err = m.unlockPILocked(t, addr, tid, b, &k) - k.release() + k.release(t) b.mu.Unlock() return err } diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go index 7c5c7665b..d0128c548 100644 --- a/pkg/sentry/kernel/futex/futex_test.go +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -22,6 +22,7 @@ import ( "testing" "unsafe" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) @@ -29,28 +30,33 @@ import ( // testData implements the Target interface, and allows us to // treat the address passed for futex operations as an index in // a byte slice for testing simplicity. -type testData []byte +type testData struct { + context.Context + data []byte +} const sizeofInt32 = 4 func newTestData(size uint) testData { - return make([]byte, size) + return testData{ + data: make([]byte, size), + } } func (t testData) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) { - val := atomic.SwapUint32((*uint32)(unsafe.Pointer(&t[addr])), new) + val := atomic.SwapUint32((*uint32)(unsafe.Pointer(&t.data[addr])), new) return val, nil } func (t testData) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) { - if atomic.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&t[addr])), old, new) { + if atomic.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&t.data[addr])), old, new) { return old, nil } - return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil + return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t.data[addr]))), nil } func (t testData) LoadUint32(addr usermem.Addr) (uint32, error) { - return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil + return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t.data[addr]))), nil } func (t testData) GetSharedKey(addr usermem.Addr) (Key, error) { @@ -83,7 +89,7 @@ func TestFutexWake(t *testing.T) { // Start waiting for wakeup. w := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w) + defer m.WaitComplete(w, d) // Perform a wakeup. if n, err := m.Wake(d, 0, private, ^uint32(0), 1); err != nil || n != 1 { @@ -106,7 +112,7 @@ func TestFutexWakeBitmask(t *testing.T) { // Start waiting for wakeup. w := newPreparedTestWaiter(t, m, d, 0, private, 0, 0x0000ffff) - defer m.WaitComplete(w) + defer m.WaitComplete(w, d) // Perform a wakeup using the wrong bitmask. if n, err := m.Wake(d, 0, private, 0xffff0000, 1); err != nil || n != 0 { @@ -141,7 +147,7 @@ func TestFutexWakeTwo(t *testing.T) { var ws [3]*Waiter for i := range ws { ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(ws[i]) + defer m.WaitComplete(ws[i], d) } // Perform two wakeups. @@ -174,9 +180,9 @@ func TestFutexWakeUnrelated(t *testing.T) { // Start two waiters waiting for wakeup on different addresses. w1 := newPreparedTestWaiter(t, m, d, 0*sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, 1*sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Perform two wakeups on the second address. if n, err := m.Wake(d, 1*sizeofInt32, private, ^uint32(0), 2); err != nil || n != 1 { @@ -216,9 +222,9 @@ func TestWakeOpFirstNonEmpty(t *testing.T) { // Add two waiters on address 0. w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Perform 10 wakeups on address 0. if n, err := m.WakeOp(d, 0, sizeofInt32, private, 10, 0, 0); err != nil || n != 2 { @@ -244,9 +250,9 @@ func TestWakeOpSecondNonEmpty(t *testing.T) { // Add two waiters on address sizeofInt32. w1 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Perform 10 wakeups on address sizeofInt32 (contingent on // d.Op(0), which should succeed). @@ -273,9 +279,9 @@ func TestWakeOpSecondNonEmptyFailingOp(t *testing.T) { // Add two waiters on address sizeofInt32. w1 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Perform 10 wakeups on address sizeofInt32 (contingent on // d.Op(1), which should fail). @@ -302,15 +308,15 @@ func TestWakeOpAllNonEmpty(t *testing.T) { // Add two waiters on address 0. w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Add two waiters on address sizeofInt32. w3 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w3) + defer m.WaitComplete(w3, d) w4 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w4) + defer m.WaitComplete(w4, d) // Perform 10 wakeups on address 0 (unconditionally), and 10 // wakeups on address sizeofInt32 (contingent on d.Op(0), which @@ -344,15 +350,15 @@ func TestWakeOpAllNonEmptyFailingOp(t *testing.T) { // Add two waiters on address 0. w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Add two waiters on address sizeofInt32. w3 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w3) + defer m.WaitComplete(w3, d) w4 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w4) + defer m.WaitComplete(w4, d) // Perform 10 wakeups on address 0 (unconditionally), and 10 // wakeups on address sizeofInt32 (contingent on d.Op(1), which @@ -388,7 +394,7 @@ func TestWakeOpSameAddress(t *testing.T) { var ws [4]*Waiter for i := range ws { ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(ws[i]) + defer m.WaitComplete(ws[i], d) } // Perform 1 wakeup on address 0 (unconditionally), and 1 wakeup @@ -422,7 +428,7 @@ func TestWakeOpSameAddressFailingOp(t *testing.T) { var ws [4]*Waiter for i := range ws { ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(ws[i]) + defer m.WaitComplete(ws[i], d) } // Perform 1 wakeup on address 0 (unconditionally), and 1 wakeup @@ -472,7 +478,7 @@ func (t *testMutex) Lock() { for { // Attempt to grab the lock. if atomic.CompareAndSwapUint32( - (*uint32)(unsafe.Pointer(&t.d[t.a])), + (*uint32)(unsafe.Pointer(&t.d.data[t.a])), testMutexUnlocked, testMutexLocked) { // Lock held. @@ -490,7 +496,7 @@ func (t *testMutex) Lock() { panic("WaitPrepare returned unexpected error: " + err.Error()) } <-w.C - t.m.WaitComplete(w) + t.m.WaitComplete(w, t.d) } } @@ -498,7 +504,7 @@ func (t *testMutex) Lock() { // This will notify any waiters via the futex manager. func (t *testMutex) Unlock() { // Unlock. - atomic.StoreUint32((*uint32)(unsafe.Pointer(&t.d[t.a])), testMutexUnlocked) + atomic.StoreUint32((*uint32)(unsafe.Pointer(&t.d.data[t.a])), testMutexUnlocked) // Notify all waiters. t.m.Wake(t.d, t.a, true, ^uint32(0), math.MaxInt32) diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 15dae0f5b..1028d13c6 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -376,7 +376,8 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.netlinkPorts = port.New() if VFS2Enabled { - if err := k.vfs.Init(); err != nil { + ctx := k.SupervisorContext() + if err := k.vfs.Init(ctx); err != nil { return fmt.Errorf("failed to initialize VFS: %v", err) } @@ -384,19 +385,19 @@ func (k *Kernel) Init(args InitKernelArgs) error { if err != nil { return fmt.Errorf("failed to create pipefs filesystem: %v", err) } - defer pipeFilesystem.DecRef() + defer pipeFilesystem.DecRef(ctx) pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{}) if err != nil { return fmt.Errorf("failed to create pipefs mount: %v", err) } k.pipeMount = pipeMount - tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(k.SupervisorContext(), &k.vfs, auth.NewRootCredentials(k.rootUserNamespace)) + tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(ctx, &k.vfs, auth.NewRootCredentials(k.rootUserNamespace)) if err != nil { return fmt.Errorf("failed to create tmpfs filesystem: %v", err) } - defer tmpfsFilesystem.DecRef() - defer tmpfsRoot.DecRef() + defer tmpfsFilesystem.DecRef(ctx) + defer tmpfsRoot.DecRef(ctx) shmMount, err := k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{}) if err != nil { return fmt.Errorf("failed to create tmpfs mount: %v", err) @@ -407,7 +408,7 @@ func (k *Kernel) Init(args InitKernelArgs) error { if err != nil { return fmt.Errorf("failed to create sockfs filesystem: %v", err) } - defer socketFilesystem.DecRef() + defer socketFilesystem.DecRef(ctx) socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{}) if err != nil { return fmt.Errorf("failed to create sockfs mount: %v", err) @@ -430,8 +431,8 @@ func (k *Kernel) SaveTo(w wire.Writer) error { defer k.extMu.Unlock() // Stop time. - k.pauseTimeLocked() - defer k.resumeTimeLocked() + k.pauseTimeLocked(ctx) + defer k.resumeTimeLocked(ctx) // Evict all evictable MemoryFile allocations. k.mf.StartEvictions() @@ -447,12 +448,12 @@ func (k *Kernel) SaveTo(w wire.Writer) error { // Remove all epoll waiter objects from underlying wait queues. // NOTE: for programs to resume execution in future snapshot scenarios, // we will need to re-establish these waiter objects after saving. - k.tasks.unregisterEpollWaiters() + k.tasks.unregisterEpollWaiters(ctx) // Clear the dirent cache before saving because Dirents must be Loaded in a // particular order (parents before children), and Loading dirents from a cache // breaks that order. - if err := k.flushMountSourceRefs(); err != nil { + if err := k.flushMountSourceRefs(ctx); err != nil { return err } @@ -505,7 +506,7 @@ func (k *Kernel) SaveTo(w wire.Writer) error { // flushMountSourceRefs flushes the MountSources for all mounted filesystems // and open FDs. -func (k *Kernel) flushMountSourceRefs() error { +func (k *Kernel) flushMountSourceRefs(ctx context.Context) error { // Flush all mount sources for currently mounted filesystems in each task. flushed := make(map[*fs.MountNamespace]struct{}) k.tasks.mu.RLock() @@ -521,7 +522,7 @@ func (k *Kernel) flushMountSourceRefs() error { // There may be some open FDs whose filesystems have been unmounted. We // must flush those as well. - return k.tasks.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { + return k.tasks.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error { file.Dirent.Inode.MountSource.FlushDirentRefs() return nil }) @@ -531,7 +532,7 @@ func (k *Kernel) flushMountSourceRefs() error { // each task. // // Precondition: Must be called with the kernel paused. -func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) { +func (ts *TaskSet) forEachFDPaused(ctx context.Context, f func(*fs.File, *vfs.FileDescription) error) (err error) { // TODO(gvisor.dev/issue/1663): Add save support for VFS2. if VFS2Enabled { return nil @@ -544,7 +545,7 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) if t.fdTable == nil { continue } - t.fdTable.forEach(func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) { + t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) { if lastErr := f(file, fileVFS2); lastErr != nil && err == nil { err = lastErr } @@ -555,7 +556,7 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { // TODO(gvisor.dev/issue/1663): Add save support for VFS2. - return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { + return ts.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error { if flags := file.Flags(); !flags.Write { return nil } @@ -602,7 +603,7 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { return nil } -func (ts *TaskSet) unregisterEpollWaiters() { +func (ts *TaskSet) unregisterEpollWaiters(ctx context.Context) { // TODO(gvisor.dev/issue/1663): Add save support for VFS2. if VFS2Enabled { return @@ -623,7 +624,7 @@ func (ts *TaskSet) unregisterEpollWaiters() { if _, ok := processed[t.fdTable]; ok { continue } - t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + t.fdTable.forEach(ctx, func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { if e, ok := file.FileOperations.(*epoll.EventPoll); ok { e.UnregisterEpollWaiters() } @@ -900,7 +901,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, root := args.MountNamespaceVFS2.Root() // The call to newFSContext below will take a reference on root, so we // don't need to hold this one. - defer root.DecRef() + defer root.DecRef(ctx) // Grab the working directory. wd := root // Default. @@ -918,7 +919,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, if err != nil { return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) } - defer wd.DecRef() + defer wd.DecRef(ctx) } opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd) fsContext = NewFSContextVFS2(root, wd, args.Umask) @@ -933,7 +934,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, root := mntns.Root() // The call to newFSContext below will take a reference on root, so we // don't need to hold this one. - defer root.DecRef() + defer root.DecRef(ctx) // Grab the working directory. remainingTraversals := args.MaxSymlinkTraversals @@ -944,7 +945,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, if err != nil { return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) } - defer wd.DecRef() + defer wd.DecRef(ctx) } opener = fsbridge.NewFSLookup(mntns, root, wd) fsContext = newFSContext(root, wd, args.Umask) @@ -1054,7 +1055,7 @@ func (k *Kernel) Start() error { // If k was created by LoadKernelFrom, timers were stopped during // Kernel.SaveTo and need to be resumed. If k was created by NewKernel, // this is a no-op. - k.resumeTimeLocked() + k.resumeTimeLocked(k.SupervisorContext()) // Start task goroutines. k.tasks.mu.RLock() defer k.tasks.mu.RUnlock() @@ -1068,7 +1069,7 @@ func (k *Kernel) Start() error { // // Preconditions: Any task goroutines running in k must be stopped. k.extMu // must be locked. -func (k *Kernel) pauseTimeLocked() { +func (k *Kernel) pauseTimeLocked(ctx context.Context) { // k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before // Kernel.Start(). if k.cpuClockTicker != nil { @@ -1090,7 +1091,7 @@ func (k *Kernel) pauseTimeLocked() { // This means we'll iterate FDTables shared by multiple tasks repeatedly, // but ktime.Timer.Pause is idempotent so this is harmless. if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { + t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { if VFS2Enabled { if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { tfd.PauseTimer() @@ -1112,7 +1113,7 @@ func (k *Kernel) pauseTimeLocked() { // // Preconditions: Any task goroutines running in k must be stopped. k.extMu // must be locked. -func (k *Kernel) resumeTimeLocked() { +func (k *Kernel) resumeTimeLocked(ctx context.Context) { if k.cpuClockTicker != nil { k.cpuClockTicker.Resume() } @@ -1126,7 +1127,7 @@ func (k *Kernel) resumeTimeLocked() { } } if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { + t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { if VFS2Enabled { if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { tfd.ResumeTimer() @@ -1262,6 +1263,13 @@ func (k *Kernel) Pause() { k.tasks.aioGoroutines.Wait() } +// ReceiveTaskStates receives full states for all tasks. +func (k *Kernel) ReceiveTaskStates() { + k.extMu.Lock() + k.tasks.PullFullState() + k.extMu.Unlock() +} + // Unpause ends the effect of a previous call to Pause. If Unpause is called // without a matching preceding call to Pause, Unpause may panic. func (k *Kernel) Unpause() { @@ -1511,7 +1519,7 @@ type SocketEntry struct { } // WeakRefGone implements refs.WeakRefUser.WeakRefGone. -func (s *SocketEntry) WeakRefGone() { +func (s *SocketEntry) WeakRefGone(context.Context) { s.k.extMu.Lock() s.k.sockets.Remove(s) s.k.extMu.Unlock() @@ -1600,7 +1608,7 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { return vfs.VirtualDentry{} } mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2() - defer mntns.DecRef() + defer mntns.DecRef(ctx) // Root() takes a reference on the root dirent for us. return mntns.Root() case vfs.CtxMountNamespace: diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 4b688c627..6497dc4ba 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -93,7 +93,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() { if !waitFor(&i.mu, &i.wWakeup, ctx) { - r.DecRef() + r.DecRef(ctx) return nil, syserror.ErrInterrupted } } @@ -111,12 +111,12 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi // On a nonblocking, write-only open, the open fails with ENXIO if the // read side isn't open yet. if flags.NonBlocking { - w.DecRef() + w.DecRef(ctx) return nil, syserror.ENXIO } if !waitFor(&i.mu, &i.rWakeup, ctx) { - w.DecRef() + w.DecRef(ctx) return nil, syserror.ErrInterrupted } } diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go index ab75a87ff..ce0db5583 100644 --- a/pkg/sentry/kernel/pipe/node_test.go +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -167,7 +167,7 @@ func TestClosedReaderBlocksWriteOpen(t *testing.T) { f := NewInodeOperations(ctx, perms, newNamedPipe(t)) rFile, _ := testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil) - rFile.DecRef() + rFile.DecRef(ctx) wDone := make(chan struct{}) // This open for write should block because the reader is now gone. diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 79645d7d2..297e8f28f 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -152,7 +152,7 @@ func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs. d := fs.NewDirent(ctx, fs.NewInode(ctx, iops, ms, sattr), fmt.Sprintf("pipe:[%d]", ino)) // The p.Open calls below will each take a reference on the Dirent. We // must drop the one we already have. - defer d.DecRef() + defer d.DecRef(ctx) return p.Open(ctx, d, fs.FileFlags{Read: true}), p.Open(ctx, d, fs.FileFlags{Write: true}) } diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go index bda739dbe..fe97e9800 100644 --- a/pkg/sentry/kernel/pipe/pipe_test.go +++ b/pkg/sentry/kernel/pipe/pipe_test.go @@ -27,8 +27,8 @@ import ( func TestPipeRW(t *testing.T) { ctx := contexttest.Context(t) r, w := NewConnectedPipe(ctx, 65536, 4096) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(ctx) + defer w.DecRef(ctx) msg := []byte("here's some bytes") wantN := int64(len(msg)) @@ -47,8 +47,8 @@ func TestPipeRW(t *testing.T) { func TestPipeReadBlock(t *testing.T) { ctx := contexttest.Context(t) r, w := NewConnectedPipe(ctx, 65536, 4096) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(ctx) + defer w.DecRef(ctx) n, err := r.Readv(ctx, usermem.BytesIOSequence(make([]byte, 1))) if n != 0 || err != syserror.ErrWouldBlock { @@ -62,8 +62,8 @@ func TestPipeWriteBlock(t *testing.T) { ctx := contexttest.Context(t) r, w := NewConnectedPipe(ctx, capacity, atomicIOBytes) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(ctx) + defer w.DecRef(ctx) msg := make([]byte, capacity+1) n, err := w.Writev(ctx, usermem.BytesIOSequence(msg)) @@ -77,8 +77,8 @@ func TestPipeWriteUntilEnd(t *testing.T) { ctx := contexttest.Context(t) r, w := NewConnectedPipe(ctx, atomicIOBytes, atomicIOBytes) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(ctx) + defer w.DecRef(ctx) msg := []byte("here's some bytes") diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go index aacf28da2..6d58b682f 100644 --- a/pkg/sentry/kernel/pipe/pipe_util.go +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -33,7 +33,7 @@ import ( // the old fs architecture. // Release cleans up the pipe's state. -func (p *Pipe) Release() { +func (p *Pipe) Release(context.Context) { p.rClose() p.wClose() diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go index 7724b4452..ac18785c0 100644 --- a/pkg/sentry/kernel/pipe/reader.go +++ b/pkg/sentry/kernel/pipe/reader.go @@ -15,6 +15,7 @@ package pipe import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/waiter" ) @@ -29,7 +30,7 @@ type Reader struct { // Release implements fs.FileOperations.Release. // // This overrides ReaderWriter.Release. -func (r *Reader) Release() { +func (r *Reader) Release(context.Context) { r.Pipe.rClose() // Wake up writers. diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index 45d4c5fc1..28f998e45 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -101,7 +101,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s // If this pipe is being opened as blocking and there's no // writer, we have to wait for a writer to open the other end. if vp.pipe.isNamed && statusFlags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) { - fd.DecRef() + fd.DecRef(ctx) return nil, syserror.EINTR } @@ -112,12 +112,12 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s // Non-blocking, write-only opens fail with ENXIO when the read // side isn't open yet. if statusFlags&linux.O_NONBLOCK != 0 { - fd.DecRef() + fd.DecRef(ctx) return nil, syserror.ENXIO } // Wait for a reader to open the other end. if !waitFor(&vp.mu, &vp.rWakeup, ctx) { - fd.DecRef() + fd.DecRef(ctx) return nil, syserror.EINTR } } @@ -169,7 +169,7 @@ type VFSPipeFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *VFSPipeFD) Release() { +func (fd *VFSPipeFD) Release(context.Context) { var event waiter.EventMask if fd.vfsfd.IsReadable() { fd.pipe.rClose() diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go index 5bc6aa931..ef4b70ca3 100644 --- a/pkg/sentry/kernel/pipe/writer.go +++ b/pkg/sentry/kernel/pipe/writer.go @@ -15,6 +15,7 @@ package pipe import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/waiter" ) @@ -29,7 +30,7 @@ type Writer struct { // Release implements fs.FileOperations.Release. // // This overrides ReaderWriter.Release. -func (w *Writer) Release() { +func (w *Writer) Release(context.Context) { w.Pipe.wClose() // Wake up readers. diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index e23e796ef..6c03d9041 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -1018,6 +1018,9 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { if err != nil { return err } + + t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) + ar := ars.Head() n, err := target.Arch().PtraceGetRegSet(uintptr(addr), &usermem.IOReadWriter{ Ctx: t, @@ -1044,10 +1047,14 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { if err != nil { return err } + + mm := t.MemoryManager() + t.p.PullFullState(mm.AddressSpace(), t.Arch()) + ar := ars.Head() n, err := target.Arch().PtraceSetRegSet(uintptr(addr), &usermem.IOReadWriter{ Ctx: t, - IO: t.MemoryManager(), + IO: mm, Addr: ar.Start, Opts: usermem.IOOpts{ AddressSpaceActive: true, @@ -1056,6 +1063,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { if err != nil { return err } + t.p.FloatingPointStateChanged() ar.End -= usermem.Addr(n) return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 0e19286de..5c4c622c2 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -16,6 +16,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/syserror" @@ -70,7 +71,7 @@ func (s *Session) incRef() { // // Precondition: callers must hold TaskSet.mu for writing. func (s *Session) decRef() { - s.refs.DecRefWithDestructor(func() { + s.refs.DecRefWithDestructor(nil, func(context.Context) { // Remove translations from the leader. for ns := s.leader.pidns; ns != nil; ns = ns.parent { id := ns.sids[s] @@ -162,7 +163,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) { } alive := true - pg.refs.DecRefWithDestructor(func() { + pg.refs.DecRefWithDestructor(nil, func(context.Context) { alive = false // don't bother with handleOrphan. // Remove translations from the originator. diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 55b4c2cdb..13ec7afe0 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -431,8 +431,8 @@ func (s *Shm) InodeID() uint64 { // DecRef overrides refs.RefCount.DecRef with a destructor. // // Precondition: Caller must not hold s.mu. -func (s *Shm) DecRef() { - s.DecRefWithDestructor(s.destroy) +func (s *Shm) DecRef(ctx context.Context) { + s.DecRefWithDestructor(ctx, s.destroy) } // Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm @@ -642,7 +642,7 @@ func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { return nil } -func (s *Shm) destroy() { +func (s *Shm) destroy(context.Context) { s.mfp.MemoryFile().DecRef(s.fr) s.registry.remove(s) } @@ -651,7 +651,7 @@ func (s *Shm) destroy() { // destroyed once it has no references. MarkDestroyed may be called multiple // times, and is safe to call after a segment has already been destroyed. See // shmctl(IPC_RMID). -func (s *Shm) MarkDestroyed() { +func (s *Shm) MarkDestroyed(ctx context.Context) { s.registry.dissociateKey(s) s.mu.Lock() @@ -663,7 +663,7 @@ func (s *Shm) MarkDestroyed() { // // N.B. This cannot be the final DecRef, as the caller also // holds a reference. - s.DecRef() + s.DecRef(ctx) return } } diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go index 8243bb93e..b07e1c1bd 100644 --- a/pkg/sentry/kernel/signalfd/signalfd.go +++ b/pkg/sentry/kernel/signalfd/signalfd.go @@ -76,7 +76,7 @@ func New(ctx context.Context, mask linux.SignalSet) (*fs.File, error) { } // Release implements fs.FileOperations.Release. -func (s *SignalOperations) Release() {} +func (s *SignalOperations) Release(context.Context) {} // Mask returns the signal mask. func (s *SignalOperations) Mask() linux.SignalSet { diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index c4db05bd8..5aee699e7 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -730,17 +730,17 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock { func (t *Task) IsChrooted() bool { if VFS2Enabled { realRoot := t.mountNamespaceVFS2.Root() - defer realRoot.DecRef() + defer realRoot.DecRef(t) root := t.fsContext.RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) return root != realRoot } realRoot := t.tg.mounts.Root() - defer realRoot.DecRef() + defer realRoot.DecRef(t) root := t.fsContext.RootDirectory() if root != nil { - defer root.DecRef() + defer root.DecRef(t) } return root != realRoot } diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index e1ecca99e..9d7a9128f 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -161,6 +161,10 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { return 0, nil, syserror.EINVAL } + // Pull task registers and FPU state, a cloned task will inherit the + // state of the current task. + t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) + // "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a // single clone(2) or unshare(2) call, the user namespace is guaranteed to // be created first, giving the child (clone(2)) or caller (unshare(2)) @@ -237,7 +241,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { var fdTable *FDTable if opts.NewFiles { - fdTable = t.fdTable.Fork() + fdTable = t.fdTable.Fork(t) } else { fdTable = t.fdTable fdTable.IncRef() @@ -294,7 +298,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { nt, err := t.tg.pidns.owner.NewTask(cfg) if err != nil { if opts.NewThreadGroup { - tg.release() + tg.release(t) } return 0, nil, err } @@ -510,7 +514,7 @@ func (t *Task) Unshare(opts *SharingOptions) error { var oldFDTable *FDTable if opts.NewFiles { oldFDTable = t.fdTable - t.fdTable = oldFDTable.Fork() + t.fdTable = oldFDTable.Fork(t) } var oldFSContext *FSContext if opts.NewFSContext { @@ -519,10 +523,10 @@ func (t *Task) Unshare(opts *SharingOptions) error { } t.mu.Unlock() if oldFDTable != nil { - oldFDTable.DecRef() + oldFDTable.DecRef(t) } if oldFSContext != nil { - oldFSContext.DecRef() + oldFSContext.DecRef(t) } return nil } diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 7803b98d0..47c28b8ff 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -199,11 +199,11 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.tg.pidns.owner.mu.Unlock() oldFDTable := t.fdTable - t.fdTable = t.fdTable.Fork() - oldFDTable.DecRef() + t.fdTable = t.fdTable.Fork(t) + oldFDTable.DecRef(t) // Remove FDs with the CloseOnExec flag set. - t.fdTable.RemoveIf(func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool { + t.fdTable.RemoveIf(t, func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool { return flags.CloseOnExec }) diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 231ac548a..c165d6cb1 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -269,12 +269,12 @@ func (*runExitMain) execute(t *Task) taskRunState { // Releasing the MM unblocks a blocked CLONE_VFORK parent. t.unstopVforkParent() - t.fsContext.DecRef() - t.fdTable.DecRef() + t.fsContext.DecRef(t) + t.fdTable.DecRef(t) t.mu.Lock() if t.mountNamespaceVFS2 != nil { - t.mountNamespaceVFS2.DecRef() + t.mountNamespaceVFS2.DecRef(t) t.mountNamespaceVFS2 = nil } t.mu.Unlock() @@ -282,7 +282,7 @@ func (*runExitMain) execute(t *Task) taskRunState { // If this is the last task to exit from the thread group, release the // thread group's resources. if lastExiter { - t.tg.release() + t.tg.release(t) } // Detach tracees. diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index eeccaa197..d23cea802 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -27,6 +27,9 @@ const ( // maxStackDebugBytes is the maximum number of user stack bytes that may be // printed by debugDumpStack. maxStackDebugBytes = 1024 + // maxCodeDebugBytes is the maximum number of user code bytes that may be + // printed by debugDumpCode. + maxCodeDebugBytes = 128 ) // Infof logs an formatted info message by calling log.Infof. @@ -61,6 +64,7 @@ func (t *Task) IsLogging(level log.Level) bool { func (t *Task) DebugDumpState() { t.debugDumpRegisters() t.debugDumpStack() + t.debugDumpCode() if mm := t.MemoryManager(); mm != nil { t.Debugf("Mappings:\n%s", mm) } @@ -128,6 +132,45 @@ func (t *Task) debugDumpStack() { } } +// debugDumpCode logs user code contents at log level debug. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) debugDumpCode() { + if !t.IsLogging(log.Debug) { + return + } + m := t.MemoryManager() + if m == nil { + t.Debugf("Memory manager for task is gone, skipping application code dump.") + return + } + t.Debugf("Code:") + // Print code on both sides of the instruction register. + start := usermem.Addr(t.Arch().IP()) - maxCodeDebugBytes/2 + // Round addr down to a 16-byte boundary. + start &= ^usermem.Addr(15) + // Print 16 bytes per line, one byte at a time. + for offset := uint64(0); offset < maxCodeDebugBytes; offset += 16 { + addr, ok := start.AddLength(offset) + if !ok { + break + } + var data [16]byte + n, err := m.CopyIn(t, addr, data[:], usermem.IOOpts{ + IgnorePermissions: true, + }) + // Print as much of the line as we can, even if an error was + // encountered. + if n > 0 { + t.Debugf("%x: % x", addr, data[:n]) + } + if err != nil { + t.Debugf("Error reading stack at address %x: %v", addr+usermem.Addr(n), err) + break + } + } +} + // trace definitions. // // Note that all region names are prefixed by ':' in order to ensure that they @@ -203,6 +246,6 @@ func (t *Task) traceExecEvent(tc *TaskContext) { trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>") return } - defer file.DecRef() + defer file.DecRef(t) trace.Logf(t.traceContext, traceCategory, "exec: %s", file.PathnameWithDeleted(t)) } diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 7d4f44caf..abaf29216 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -260,7 +260,7 @@ func (app *runApp) execute(t *Task) taskRunState { region := trace.StartRegion(t.traceContext, runRegion) t.accountTaskGoroutineEnter(TaskGoroutineRunningApp) - info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU) + info, at, err := t.p.Switch(t, t.MemoryManager(), t.Arch(), t.rseqCPU) t.accountTaskGoroutineLeave(TaskGoroutineRunningApp) region.End() diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 79766cafe..2180fd27d 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -255,10 +255,11 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) } } + mm := t.MemoryManager() // Set up the signal handler. If we have a saved signal mask, the signal // handler should run with the current mask, but sigreturn should restore // the saved one. - st := &arch.Stack{t.Arch(), t.MemoryManager(), sp} + st := &arch.Stack{t.Arch(), mm, sp} mask := t.signalMask if t.haveSavedSignalMask { mask = t.savedSignalMask @@ -273,12 +274,13 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) // Please see the linux code as reference: // linux/arch/arm64/kernel/signal.c:setup_return() if act.Flags&linux.SA_RESTORER == 0 { - act.Restorer = t.MemoryManager().VDSOSigReturn() + act.Restorer = mm.VDSOSigReturn() } if err := t.Arch().SignalSetup(st, &act, info, &alt, mask); err != nil { return err } + t.p.FloatingPointStateChanged() t.haveSavedSignalMask = false // Add our signal mask. @@ -310,6 +312,7 @@ func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) { // Restore our signal mask. SIGKILL and SIGSTOP should not be blocked. t.SetSignalMask(sigset &^ UnblockableSignals) + t.p.FloatingPointStateChanged() return ctrlResume, nil } @@ -636,6 +639,7 @@ func (t *Task) SetSavedSignalMask(mask linux.SignalSet) { // SignalStack returns the task-private signal stack. func (t *Task) SignalStack() arch.SignalStack { + t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) alt := t.signalStack if t.onSignalStack(alt) { alt.Flags |= arch.SignalStackFlagOnStack @@ -1050,6 +1054,8 @@ func (*runInterrupt) execute(t *Task) taskRunState { // Are there signals pending? if info := t.dequeueSignalLocked(t.signalMask); info != nil { + t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) + if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 { // Indicate that we've dequeued a stop signal before unlocking the // signal mutex; initiateGroupStop will check for races with diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 8485fb4b6..64c1e120a 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -102,10 +102,10 @@ func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) { t, err := ts.newTask(cfg) if err != nil { cfg.TaskContext.release() - cfg.FSContext.DecRef() - cfg.FDTable.DecRef() + cfg.FSContext.DecRef(t) + cfg.FDTable.DecRef(t) if cfg.MountNamespaceVFS2 != nil { - cfg.MountNamespaceVFS2.DecRef() + cfg.MountNamespaceVFS2.DecRef(t) } return nil, err } diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go index 10c6e455c..296735d32 100644 --- a/pkg/sentry/kernel/task_stop.go +++ b/pkg/sentry/kernel/task_stop.go @@ -205,6 +205,22 @@ func (ts *TaskSet) BeginExternalStop() { } } +// PullFullState receives full states for all tasks. +func (ts *TaskSet) PullFullState() { + ts.mu.Lock() + defer ts.mu.Unlock() + if ts.Root == nil { + return + } + for t := range ts.Root.tids { + t.Activate() + if mm := t.MemoryManager(); mm != nil { + t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) + } + t.Deactivate() + } +} + // EndExternalStop indicates the end of an external stop started by a previous // call to TaskSet.BeginExternalStop. EndExternalStop does not wait for task // goroutines to resume. diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 4dfd2c990..0b34c0099 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -308,7 +308,7 @@ func (tg *ThreadGroup) Limits() *limits.LimitSet { } // release releases the thread group's resources. -func (tg *ThreadGroup) release() { +func (tg *ThreadGroup) release(t *Task) { // Timers must be destroyed without holding the TaskSet or signal mutexes // since timers send signals with Timer.mu locked. tg.itimerRealTimer.Destroy() @@ -325,7 +325,7 @@ func (tg *ThreadGroup) release() { it.DestroyTimer() } if tg.mounts != nil { - tg.mounts.DecRef() + tg.mounts.DecRef(t) } } diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index ddeaff3db..20dd1cc21 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -281,7 +281,7 @@ func mapSegment(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, phdr } defer func() { if mopts.MappingIdentity != nil { - mopts.MappingIdentity.DecRef() + mopts.MappingIdentity.DecRef(ctx) } }() if err := f.ConfigureMMap(ctx, &mopts); err != nil { @@ -663,7 +663,7 @@ func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error ctx.Infof("Error opening interpreter %s: %v", bin.interpreter, err) return loadedELF{}, nil, err } - defer intFile.DecRef() + defer intFile.DecRef(ctx) interp, err = loadInterpreterELF(ctx, args.MemoryManager, intFile, bin) if err != nil { diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 986c7fb4d..8d6802ea3 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -154,7 +154,7 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context return loadedELF{}, nil, nil, nil, err } // Ensure file is release in case the code loops or errors out. - defer args.File.DecRef() + defer args.File.DecRef(ctx) } else { if err := checkIsRegularFile(ctx, args.File, args.Filename); err != nil { return loadedELF{}, nil, nil, nil, err @@ -223,7 +223,7 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V if err != nil { return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux()) } - defer file.DecRef() + defer file.DecRef(ctx) // Load the VDSO. vdsoAddr, err := loadVDSO(ctx, args.MemoryManager, vdso, loaded) @@ -292,7 +292,7 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V m.SetEnvvStart(sl.EnvvStart) m.SetEnvvEnd(sl.EnvvEnd) m.SetAuxv(auxv) - m.SetExecutable(file) + m.SetExecutable(ctx, file) symbolValue, err := getSymbolValueFromVDSO("rt_sigreturn") if err != nil { diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index c188f6c29..65d83096f 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -238,7 +238,7 @@ type MappingIdentity interface { IncRef() // DecRef decrements the MappingIdentity's reference count. - DecRef() + DecRef(ctx context.Context) // MappedName returns the application-visible name shown in // /proc/[pid]/maps. @@ -360,6 +360,13 @@ type MMapOpts struct { // // TODO(jamieliu): Replace entirely with MappingIdentity? Hint string + + // Force means to skip validation checks of Addr and Length. It can be + // used to create special mappings below mm.layout.MinAddr and + // mm.layout.MaxAddr. It has to be used with caution. + // + // If Force is true, Unmap and Fixed must be true. + Force bool } // File represents a host file that may be mapped into an platform.AddressSpace. diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 1999ec706..16fea53c4 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -258,8 +258,8 @@ func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) { } // DecRef implements refs.RefCounter.DecRef. -func (m *aioMappable) DecRef() { - m.AtomicRefCount.DecRefWithDestructor(func() { +func (m *aioMappable) DecRef(ctx context.Context) { + m.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) { m.mfp.MemoryFile().DecRef(m.fr) }) } @@ -367,7 +367,7 @@ func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint if err != nil { return 0, err } - defer m.DecRef() + defer m.DecRef(ctx) addr, err := mm.MMap(ctx, memmap.MMapOpts{ Length: aioRingBufferSize, MappingIdentity: m, diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index aac56679b..09dbc06a4 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -57,6 +57,8 @@ func (mm *MemoryManager) SetMmapLayout(ac arch.Context, r *limits.LimitSet) (arc // Fork creates a copy of mm with 1 user, as for Linux syscalls fork() or // clone() (without CLONE_VM). func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { + mm.AddressSpace().PreFork() + defer mm.AddressSpace().PostFork() mm.metadataMu.Lock() defer mm.metadataMu.Unlock() mm.mappingMu.RLock() @@ -258,7 +260,7 @@ func (mm *MemoryManager) DecUsers(ctx context.Context) { mm.executable = nil mm.metadataMu.Unlock() if exe != nil { - exe.DecRef() + exe.DecRef(ctx) } mm.activeMu.Lock() diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go index 28e5057f7..0cfd60f6c 100644 --- a/pkg/sentry/mm/metadata.go +++ b/pkg/sentry/mm/metadata.go @@ -15,6 +15,7 @@ package mm import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/usermem" @@ -147,7 +148,7 @@ func (mm *MemoryManager) Executable() fsbridge.File { // SetExecutable sets the executable. // // This takes a reference on d. -func (mm *MemoryManager) SetExecutable(file fsbridge.File) { +func (mm *MemoryManager) SetExecutable(ctx context.Context, file fsbridge.File) { mm.metadataMu.Lock() // Grab a new reference. @@ -164,7 +165,7 @@ func (mm *MemoryManager) SetExecutable(file fsbridge.File) { // Do this without holding the lock, since it may wind up doing some // I/O to sync the dirent, etc. if orig != nil { - orig.DecRef() + orig.DecRef(ctx) } } diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index 0e142fb11..4cdb52eb6 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -50,8 +50,8 @@ func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr memmap.F } // DecRef implements refs.RefCounter.DecRef. -func (m *SpecialMappable) DecRef() { - m.AtomicRefCount.DecRefWithDestructor(func() { +func (m *SpecialMappable) DecRef(ctx context.Context) { + m.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) { m.mfp.MemoryFile().DecRef(m.fr) }) } diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 3f496aa9f..e74d4e1c1 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -101,7 +101,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme if err != nil { return 0, err } - defer m.DecRef() + defer m.DecRef(ctx) opts.MappingIdentity = m opts.Mappable = m } @@ -1191,7 +1191,7 @@ func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length ui mr := vseg.mappableRangeOf(vseg.Range().Intersect(ar)) mm.mappingMu.RUnlock() err := id.Msync(ctx, mr) - id.DecRef() + id.DecRef(ctx) if err != nil { return err } diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 16d8207e9..c4e1989ed 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -42,7 +42,12 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp Map32Bit: opts.Map32Bit, }) if err != nil { - return vmaIterator{}, usermem.AddrRange{}, err + // Can't force without opts.Unmap and opts.Fixed. + if opts.Force && opts.Unmap && opts.Fixed { + addr = opts.Addr + } else { + return vmaIterator{}, usermem.AddrRange{}, err + } } ar, _ := addr.ToRange(opts.Length) @@ -377,7 +382,7 @@ func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRa vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off, vma.canWriteMappableLocked()) } if vma.id != nil { - vma.id.DecRef() + vma.id.DecRef(ctx) } mm.usageAS -= uint64(vmaAR.Length()) if vma.isPrivateDataLocked() { @@ -446,7 +451,7 @@ func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRa } if vma2.id != nil { - vma2.id.DecRef() + vma2.id.DecRef(context.Background()) } return vma1, true } diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index 3243d7214..46d3be58c 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -409,7 +409,6 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (memmap.File f.mappingsMu.Unlock() } - // Mark selected pages as in use. if f.opts.ManualZeroing { if err := f.forEachMappingSlice(fr, func(bs []byte) { for i := range bs { @@ -419,6 +418,7 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (memmap.File return memmap.FileRange{}, err } } + // Mark selected pages as in use. if !f.usage.Add(fr, usageInfo{ kind: kind, refs: 1, diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index b5d27a72a..3970dd81d 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -41,6 +41,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/atomicbitops", + "//pkg/context", "//pkg/cpuid", "//pkg/log", "//pkg/procid", diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index 98a3e539d..af5c5e191 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -248,3 +248,9 @@ func (as *addressSpace) Release() { // Drop all cached machine references. as.machine.dropPageTables(as.pageTables) } + +// PreFork implements platform.AddressSpace.PreFork. +func (as *addressSpace) PreFork() {} + +// PostFork implements platform.AddressSpace.PostFork. +func (as *addressSpace) PostFork() {} diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go index 6507121ea..eb92721fb 100644 --- a/pkg/sentry/platform/kvm/context.go +++ b/pkg/sentry/platform/kvm/context.go @@ -15,6 +15,7 @@ package kvm import ( + pkgcontext "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/interrupt" @@ -37,7 +38,8 @@ type context struct { } // Switch runs the provided context in the given address space. -func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*arch.SignalInfo, usermem.AccessType, error) { +func (c *context) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac arch.Context, _ int32) (*arch.SignalInfo, usermem.AccessType, error) { + as := mm.AddressSpace() localAS := as.(*addressSpace) // Grab a vCPU. @@ -88,3 +90,9 @@ func (c *context) Interrupt() { // Release implements platform.Context.Release(). func (c *context) Release() {} + +// FloatingPointStateChanged implements platform.Context.FloatingPointStateChanged. +func (c *context) FloatingPointStateChanged() {} + +// PullFullState implements platform.Context.PullFullState. +func (c *context) PullFullState(as platform.AddressSpace, ac arch.Context) {} diff --git a/pkg/sentry/platform/kvm/kvm_const_arm64.go b/pkg/sentry/platform/kvm/kvm_const_arm64.go index 6f0539c29..fdc599477 100644 --- a/pkg/sentry/platform/kvm/kvm_const_arm64.go +++ b/pkg/sentry/platform/kvm/kvm_const_arm64.go @@ -116,6 +116,17 @@ const ( // Arm64: Exception Syndrome Register EL1. const ( + _ESR_ELx_EC_SHIFT = 26 + _ESR_ELx_EC_MASK = 0x3F << _ESR_ELx_EC_SHIFT + + _ESR_ELx_EC_IMP_DEF = 0x1f + _ESR_ELx_EC_IABT_LOW = 0x20 + _ESR_ELx_EC_IABT_CUR = 0x21 + _ESR_ELx_EC_PC_ALIGN = 0x22 + + _ESR_ELx_CM = 1 << 8 + _ESR_ELx_WNR = 1 << 6 + _ESR_ELx_FSC = 0x3F _ESR_SEGV_MAPERR_L0 = 0x4 diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go index f3bf973de..9db171af9 100644 --- a/pkg/sentry/platform/kvm/machine_arm64.go +++ b/pkg/sentry/platform/kvm/machine_arm64.go @@ -125,71 +125,59 @@ func nonCanonical(addr uint64, signal int32, info *arch.SignalInfo) (usermem.Acc return usermem.NoAccess, platform.ErrContextSignal } +// isInstructionAbort returns true if it is an instruction abort. +// +//go:nosplit +func isInstructionAbort(code uint64) bool { + value := (code & _ESR_ELx_EC_MASK) >> _ESR_ELx_EC_SHIFT + return value == _ESR_ELx_EC_IABT_LOW +} + +// isWriteFault returns whether it is a write fault. +// +//go:nosplit +func isWriteFault(code uint64) bool { + if isInstructionAbort(code) { + return false + } + + return (code & _ESR_ELx_WNR) != 0 +} + // fault generates an appropriate fault return. // //go:nosplit func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (usermem.AccessType, error) { + bluepill(c) // Probably no-op, but may not be. faultAddr := c.GetFaultAddr() code, user := c.ErrorCode() + if !user { + // The last fault serviced by this CPU was not a user + // fault, so we can't reliably trust the faultAddr or + // the code provided here. We need to re-execute. + return usermem.NoAccess, platform.ErrContextInterrupt + } + // Reset the pointed SignalInfo. *info = arch.SignalInfo{Signo: signal} info.SetAddr(uint64(faultAddr)) - read := true - write := false - execute := true - ret := code & _ESR_ELx_FSC switch ret { case _ESR_SEGV_MAPERR_L0, _ESR_SEGV_MAPERR_L1, _ESR_SEGV_MAPERR_L2, _ESR_SEGV_MAPERR_L3: info.Code = 1 //SEGV_MAPERR - read = false - write = true - execute = false case _ESR_SEGV_ACCERR_L1, _ESR_SEGV_ACCERR_L2, _ESR_SEGV_ACCERR_L3, _ESR_SEGV_PEMERR_L1, _ESR_SEGV_PEMERR_L2, _ESR_SEGV_PEMERR_L3: info.Code = 2 // SEGV_ACCERR. - read = true - write = false - execute = false default: info.Code = 2 } - if !user { - read = true - write = false - execute = true - - } accessType := usermem.AccessType{ - Read: read, - Write: write, - Execute: execute, + Read: !isWriteFault(uint64(code)), + Write: isWriteFault(uint64(code)), + Execute: isInstructionAbort(uint64(code)), } return accessType, platform.ErrContextSignal } - -// retryInGuest runs the given function in guest mode. -// -// If the function does not complete in guest mode (due to execution of a -// system call due to a GC stall, for example), then it will be retried. The -// given function must be idempotent as a result of the retry mechanism. -func (m *machine) retryInGuest(fn func()) { - c := m.Get() - defer m.Put(c) - for { - c.ClearErrorCode() // See below. - bluepill(c) // Force guest mode. - fn() // Execute the given function. - _, user := c.ErrorCode() - if user { - // If user is set, then we haven't bailed back to host - // mode via a kernel exception or system call. We - // consider the full function to have executed in guest - // mode and we can return. - break - } - } -} diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go index ca902c8c1..4dad877ba 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go +++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go @@ -56,5 +56,9 @@ func CheckTestRegs(regs *arch.Registers, full bool) (err error) { err = addRegisterMismatch(err, fmt.Sprintf("R%d", i), regs.Regs[i], need) } } + // Check tls. + if need := ^uint64(11); regs.TPIDR_EL0 != need { + err = addRegisterMismatch(err, "tpdir_el0", regs.TPIDR_EL0, need) + } return } diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s index 07658144e..6caf7282d 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s +++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s @@ -52,6 +52,8 @@ start: TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8 NO_LOCAL_POINTERS + // gc will touch fpsimd, so we should test it. + // such as in <runtime.deductSweepCredit>. FMOVD $(9.9), F0 MOVD $SYS_GETPID, R8 // getpid SVC @@ -102,11 +104,15 @@ isNaN: TEXT ·TwiddleRegsSyscall(SB),NOSPLIT,$0 TWIDDLE_REGS() + MSR R10, TPIDR_EL0 + // Trapped in el0_svc. SVC RET // never reached TEXT ·TwiddleRegsFault(SB),NOSPLIT,$0 - TWIDDLE_REGS() - // Branch to Register branches unconditionally to an address in <Rn>. - JMP (R4) // <=> br x4, must fault - RET // never reached + TWIDDLE_REGS() + MSR R10, TPIDR_EL0 + // Trapped in el0_ia. + // Branch to Register branches unconditionally to an address in <Rn>. + JMP (R6) // <=> br x6, must fault + RET // never reached diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index 4b13eec30..3f99afdd1 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -22,6 +22,7 @@ import ( "os" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/memmap" @@ -114,6 +115,17 @@ func (NoCPUPreemptionDetection) PreemptAllCPUs() error { panic("This platform does not support CPU preemption detection") } +// MemoryManager represents an abstraction above the platform address space +// which manages memory mappings and their contents. +type MemoryManager interface { + //usermem.IO provides access to the contents of a virtual memory space. + usermem.IO + // MMap establishes a memory mapping. + MMap(ctx context.Context, opts memmap.MMapOpts) (usermem.Addr, error) + // AddressSpace returns the AddressSpace bound to mm. + AddressSpace() AddressSpace +} + // Context represents the execution context for a single thread. type Context interface { // Switch resumes execution of the thread specified by the arch.Context @@ -143,7 +155,30 @@ type Context interface { // concurrent call to Switch(). // // - ErrContextCPUPreempted: See the definition of that error for details. - Switch(as AddressSpace, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error) + Switch(ctx context.Context, mm MemoryManager, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error) + + // PullFullState() pulls a full state of the application thread. + // + // A platform can support lazy loading/restoring of a thread state + // which includes registers and a floating point state. + // + // For example, when the Sentry handles a system call, it may have only + // syscall arguments without other registers and a floating point + // state. And in this case, if the Sentry will need to construct a + // signal frame to call a signal handler, it will need to call + // PullFullState() to load all registers and FPU state. + // + // Preconditions: The caller must be running on the task goroutine. + PullFullState(as AddressSpace, ac arch.Context) + + // FloatingPointStateChanged forces restoring a full state of the application thread. + // + // A platform can support lazy loading/restoring of a thread state. + // This means that if the Sentry has not changed a thread state, + // the platform may not restore it. + // + // Preconditions: The caller must be running on the task goroutine. + FloatingPointStateChanged() // Interrupt interrupts a concurrent call to Switch(), causing it to return // ErrContextInterrupt. @@ -218,6 +253,13 @@ type AddressSpace interface { // must be acquired via platform.NewAddressSpace(). Release() + // PreFork() is called before creating a copy of AddressSpace. This + // guarantees that this address space will be in a consistent state. + PreFork() + + // PostFork() is called after creating a copy of AddressSpace. + PostFork() + // AddressSpaceIO methods are supported iff the associated platform's // Platform.SupportsAddressSpaceIO() == true. AddressSpaces for which this // does not hold may panic if AddressSpaceIO methods are invoked. diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index 29fd23cc3..e04165fbf 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -24,6 +24,7 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/log", "//pkg/procid", "//pkg/safecopy", diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index 08d055e05..45ff2bcc3 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -48,6 +48,7 @@ import ( "os" "gvisor.dev/gvisor/pkg/abi/linux" + pkgcontext "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/interrupt" @@ -95,7 +96,8 @@ type context struct { } // Switch runs the provided context in the given address space. -func (c *context) Switch(as platform.AddressSpace, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error) { +func (c *context) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error) { + as := mm.AddressSpace() s := as.(*subprocess) isSyscall := s.switchToApp(c, ac) @@ -180,6 +182,12 @@ func (c *context) Interrupt() { // Release implements platform.Context.Release(). func (c *context) Release() {} +// FloatingPointStateChanged implements platform.Context.FloatingPointStateChanged. +func (c *context) FloatingPointStateChanged() {} + +// PullFullState implements platform.Context.PullFullState. +func (c *context) PullFullState(as platform.AddressSpace, ac arch.Context) {} + // PTrace represents a collection of ptrace subprocesses. type PTrace struct { platform.MMapMinAddr diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index c990f3454..e1d54d8a2 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -662,3 +662,9 @@ func (s *subprocess) Unmap(addr usermem.Addr, length uint64) { panic(fmt.Sprintf("munmap(%x, %x)) failed: %v", addr, length, err)) } } + +// PreFork implements platform.AddressSpace.PreFork. +func (s *subprocess) PreFork() {} + +// PostFork implements platform.AddressSpace.PostFork. +func (s *subprocess) PostFork() {} diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go index 5b98659b2..42009dac0 100644 --- a/pkg/sentry/platform/ring0/kernel_arm64.go +++ b/pkg/sentry/platform/ring0/kernel_arm64.go @@ -59,11 +59,13 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { regs.Pstate &= ^uint64(PsrFlagsClear) regs.Pstate |= UserFlagsSet + LoadFloatingPoint(switchOpts.FloatingPointState) SetTLS(regs.TPIDR_EL0) kernelExitToEl0() regs.TPIDR_EL0 = GetTLS() + SaveFloatingPoint(switchOpts.FloatingPointState) vector = c.vecCode diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go index a6345010d..00e52c8af 100644 --- a/pkg/sentry/platform/ring0/lib_arm64.go +++ b/pkg/sentry/platform/ring0/lib_arm64.go @@ -38,6 +38,12 @@ func SaveVRegs(*byte) // LoadVRegs loads V0-V31 registers. func LoadVRegs(*byte) +// LoadFloatingPoint loads floating point state. +func LoadFloatingPoint(*byte) + +// SaveFloatingPoint saves floating point state. +func SaveFloatingPoint(*byte) + // GetTLS returns the value of TPIDR_EL0 register. func GetTLS() (value uint64) diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s index b63e14b41..86bfbe46f 100644 --- a/pkg/sentry/platform/ring0/lib_arm64.s +++ b/pkg/sentry/platform/ring0/lib_arm64.s @@ -129,3 +129,89 @@ TEXT ·LoadVRegs(SB),NOSPLIT,$0-8 ISB $15 RET + +TEXT ·LoadFloatingPoint(SB),NOSPLIT,$0-8 + MOVD addr+0(FP), R0 + + MOVD 0(R0), R1 + MOVD R1, FPSR + MOVD 8(R0), R1 + MOVD R1, NZCV + + FMOVD 16*1(R0), F0 + FMOVD 16*2(R0), F1 + FMOVD 16*3(R0), F2 + FMOVD 16*4(R0), F3 + FMOVD 16*5(R0), F4 + FMOVD 16*6(R0), F5 + FMOVD 16*7(R0), F6 + FMOVD 16*8(R0), F7 + FMOVD 16*9(R0), F8 + FMOVD 16*10(R0), F9 + FMOVD 16*11(R0), F10 + FMOVD 16*12(R0), F11 + FMOVD 16*13(R0), F12 + FMOVD 16*14(R0), F13 + FMOVD 16*15(R0), F14 + FMOVD 16*16(R0), F15 + FMOVD 16*17(R0), F16 + FMOVD 16*18(R0), F17 + FMOVD 16*19(R0), F18 + FMOVD 16*20(R0), F19 + FMOVD 16*21(R0), F20 + FMOVD 16*22(R0), F21 + FMOVD 16*23(R0), F22 + FMOVD 16*24(R0), F23 + FMOVD 16*25(R0), F24 + FMOVD 16*26(R0), F25 + FMOVD 16*27(R0), F26 + FMOVD 16*28(R0), F27 + FMOVD 16*29(R0), F28 + FMOVD 16*30(R0), F29 + FMOVD 16*31(R0), F30 + FMOVD 16*32(R0), F31 + + RET + +TEXT ·SaveFloatingPoint(SB),NOSPLIT,$0-8 + MOVD addr+0(FP), R0 + + MOVD FPSR, R1 + MOVD R1, 0(R0) + MOVD NZCV, R1 + MOVD R1, 8(R0) + + FMOVD F0, 16*1(R0) + FMOVD F1, 16*2(R0) + FMOVD F2, 16*3(R0) + FMOVD F3, 16*4(R0) + FMOVD F4, 16*5(R0) + FMOVD F5, 16*6(R0) + FMOVD F6, 16*7(R0) + FMOVD F7, 16*8(R0) + FMOVD F8, 16*9(R0) + FMOVD F9, 16*10(R0) + FMOVD F10, 16*11(R0) + FMOVD F11, 16*12(R0) + FMOVD F12, 16*13(R0) + FMOVD F13, 16*14(R0) + FMOVD F14, 16*15(R0) + FMOVD F15, 16*16(R0) + FMOVD F16, 16*17(R0) + FMOVD F17, 16*18(R0) + FMOVD F18, 16*19(R0) + FMOVD F19, 16*20(R0) + FMOVD F20, 16*21(R0) + FMOVD F21, 16*22(R0) + FMOVD F22, 16*23(R0) + FMOVD F23, 16*24(R0) + FMOVD F24, 16*25(R0) + FMOVD F25, 16*26(R0) + FMOVD F26, 16*27(R0) + FMOVD F27, 16*28(R0) + FMOVD F28, 16*29(R0) + FMOVD F29, 16*30(R0) + FMOVD F30, 16*31(R0) + FMOVD F31, 16*32(R0) + + RET diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index 8b439a078..70ccf77a7 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -68,7 +68,7 @@ func NewSCMRights(t *kernel.Task, fds []int32) (SCMRights, error) { for _, fd := range fds { file := t.GetFile(fd) if file == nil { - files.Release() + files.Release(t) return nil, syserror.EBADF } files = append(files, file) @@ -100,9 +100,9 @@ func (fs *RightsFiles) Clone() transport.RightsControlMessage { } // Release implements transport.RightsControlMessage.Release. -func (fs *RightsFiles) Release() { +func (fs *RightsFiles) Release(ctx context.Context) { for _, f := range *fs { - f.DecRef() + f.DecRef(ctx) } *fs = nil } @@ -115,7 +115,7 @@ func rightsFDs(t *kernel.Task, rights SCMRights, cloexec bool, max int) ([]int32 fd, err := t.NewFDFrom(0, files[0], kernel.FDFlags{ CloseOnExec: cloexec, }) - files[0].DecRef() + files[0].DecRef(t) files = files[1:] if err != nil { t.Warningf("Error inserting FD: %v", err) diff --git a/pkg/sentry/socket/control/control_vfs2.go b/pkg/sentry/socket/control/control_vfs2.go index fd08179be..d9621968c 100644 --- a/pkg/sentry/socket/control/control_vfs2.go +++ b/pkg/sentry/socket/control/control_vfs2.go @@ -46,7 +46,7 @@ func NewSCMRightsVFS2(t *kernel.Task, fds []int32) (SCMRightsVFS2, error) { for _, fd := range fds { file := t.GetFileVFS2(fd) if file == nil { - files.Release() + files.Release(t) return nil, syserror.EBADF } files = append(files, file) @@ -78,9 +78,9 @@ func (fs *RightsFilesVFS2) Clone() transport.RightsControlMessage { } // Release implements transport.RightsControlMessage.Release. -func (fs *RightsFilesVFS2) Release() { +func (fs *RightsFilesVFS2) Release(ctx context.Context) { for _, f := range *fs { - f.DecRef() + f.DecRef(ctx) } *fs = nil } @@ -93,7 +93,7 @@ func rightsFDsVFS2(t *kernel.Task, rights SCMRightsVFS2, cloexec bool, max int) fd, err := t.NewFDFromVFS2(0, files[0], kernel.FDFlags{ CloseOnExec: cloexec, }) - files[0].DecRef() + files[0].DecRef(t) files = files[1:] if err != nil { t.Warningf("Error inserting FD: %v", err) diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 532a1ea5d..242e6bf76 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -100,12 +100,12 @@ func newSocketFile(ctx context.Context, family int, stype linux.SockType, protoc return nil, syserr.FromError(err) } dirent := socket.NewDirent(ctx, socketDevice) - defer dirent.DecRef() + defer dirent.DecRef(ctx) return fs.NewFile(ctx, dirent, fs.FileFlags{NonBlocking: nonblock, Read: true, Write: true, NonSeekable: true}, s), nil } // Release implements fs.FileOperations.Release. -func (s *socketOpsCommon) Release() { +func (s *socketOpsCommon) Release(context.Context) { fdnotifier.RemoveFD(int32(s.fd)) syscall.Close(s.fd) } @@ -269,7 +269,7 @@ func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, syscall.Close(fd) return 0, nil, 0, err } - defer f.DecRef() + defer f.DecRef(t) kfd, kerr = t.NewFDFromVFS2(0, f, kernel.FDFlags{ CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, @@ -281,7 +281,7 @@ func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, syscall.Close(fd) return 0, nil, 0, err } - defer f.DecRef() + defer f.DecRef(t) kfd, kerr = t.NewFDFrom(0, f, kernel.FDFlags{ CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go index a48082631..fda3dcb35 100644 --- a/pkg/sentry/socket/hostinet/stack.go +++ b/pkg/sentry/socket/hostinet/stack.go @@ -53,6 +53,7 @@ type Stack struct { interfaceAddrs map[int32][]inet.InterfaceAddr routes []inet.Route supportsIPv6 bool + tcpRecovery inet.TCPLossRecovery tcpRecvBufSize inet.TCPBufferSize tcpSendBufSize inet.TCPBufferSize tcpSACKEnabled bool @@ -350,6 +351,16 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error { return syserror.EACCES } +// TCPRecovery implements inet.Stack.TCPRecovery. +func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) { + return s.tcpRecovery, nil +} + +// SetTCPRecovery implements inet.Stack.SetTCPRecovery. +func (s *Stack) SetTCPRecovery(recovery inet.TCPLossRecovery) error { + return syserror.EACCES +} + // getLine reads one line from proc file, with specified prefix. // The last argument, withHeader, specifies if it contains line header. func getLine(f *os.File, prefix string, withHeader bool) string { diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go index 0d45e5053..31e374833 100644 --- a/pkg/sentry/socket/netlink/provider.go +++ b/pkg/sentry/socket/netlink/provider.go @@ -97,7 +97,7 @@ func (*socketProvider) Socket(t *kernel.Task, stype linux.SockType, protocol int } d := socket.NewDirent(t, netlinkSocketDevice) - defer d.DecRef() + defer d.DecRef(t) return fs.NewFile(t, d, fs.FileFlags{Read: true, Write: true, NonSeekable: true}, s), nil } diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 98ca7add0..68a9b9a96 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -140,14 +140,14 @@ func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socke // Bind the endpoint for good measure so we can connect to it. The // bound address will never be exposed. if err := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); err != nil { - ep.Close() + ep.Close(t) return nil, err } // Create a connection from which the kernel can write messages. connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect(t) if err != nil { - ep.Close() + ep.Close(t) return nil, err } @@ -164,9 +164,9 @@ func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socke } // Release implements fs.FileOperations.Release. -func (s *socketOpsCommon) Release() { - s.connection.Release() - s.ep.Close() +func (s *socketOpsCommon) Release(ctx context.Context) { + s.connection.Release(ctx) + s.ep.Close(ctx) if s.bound { s.ports.Release(s.protocol.Protocol(), s.portID) @@ -621,7 +621,7 @@ func (s *socketOpsCommon) sendResponse(ctx context.Context, ms *MessageSet) *sys if len(bufs) > 0 { // RecvMsg never receives the address, so we don't need to send // one. - _, notify, err := s.connection.Send(bufs, cms, tcpip.FullAddress{}) + _, notify, err := s.connection.Send(ctx, bufs, cms, tcpip.FullAddress{}) // If the buffer is full, we simply drop messages, just like // Linux. if err != nil && err != syserr.ErrWouldBlock { @@ -648,7 +648,7 @@ func (s *socketOpsCommon) sendResponse(ctx context.Context, ms *MessageSet) *sys // Add the dump_done_errno payload. m.Put(int64(0)) - _, notify, err := s.connection.Send([][]byte{m.Finalize()}, cms, tcpip.FullAddress{}) + _, notify, err := s.connection.Send(ctx, [][]byte{m.Finalize()}, cms, tcpip.FullAddress{}) if err != nil && err != syserr.ErrWouldBlock { return err } diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go index dbcd8b49a..a38d25da9 100644 --- a/pkg/sentry/socket/netlink/socket_vfs2.go +++ b/pkg/sentry/socket/netlink/socket_vfs2.go @@ -57,14 +57,14 @@ func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketV // Bind the endpoint for good measure so we can connect to it. The // bound address will never be exposed. if err := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); err != nil { - ep.Close() + ep.Close(t) return nil, err } // Create a connection from which the kernel can write messages. connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect(t) if err != nil { - ep.Close() + ep.Close(t) return nil, err } diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index f86e6cd7a..e4846bc0b 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -330,7 +330,7 @@ func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue } dirent := socket.NewDirent(t, netstackDevice) - defer dirent.DecRef() + defer dirent.DecRef(t) return fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true, NonSeekable: true}, &SocketOperations{ socketOpsCommon: socketOpsCommon{ Queue: queue, @@ -479,7 +479,7 @@ func (s *socketOpsCommon) fetchReadView() *syserr.Error { } // Release implements fs.FileOperations.Release. -func (s *socketOpsCommon) Release() { +func (s *socketOpsCommon) Release(context.Context) { s.Endpoint.Close() } @@ -854,7 +854,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, if err != nil { return 0, nil, 0, err } - defer ns.DecRef() + defer ns.DecRef(t) if flags&linux.SOCK_NONBLOCK != 0 { flags := ns.Flags() @@ -1490,6 +1490,10 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (marsha vP := primitive.Int32(boolToInt32(v)) return &vP, nil + case linux.SO_ORIGINAL_DST: + // TODO(gvisor.dev/issue/170): ip6tables. + return nil, syserr.ErrInvalidArgument + default: emitUnimplementedEventIPv6(t, name) } @@ -1600,6 +1604,19 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in vP := primitive.Int32(boolToInt32(v)) return &vP, nil + case linux.SO_ORIGINAL_DST: + if outLen < int(binary.Size(linux.SockAddrInet{})) { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.OriginalDestinationOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress(v)) + return a.(*linux.SockAddrInet), nil + default: emitUnimplementedEventIP(t, name) } diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go index a9025b0ec..3335e7430 100644 --- a/pkg/sentry/socket/netstack/netstack_vfs2.go +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -169,7 +169,7 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block if err != nil { return 0, nil, 0, err } - defer ns.DecRef() + defer ns.DecRef(t) if err := ns.SetStatusFlags(t, t.Credentials(), uint32(flags&linux.SOCK_NONBLOCK)); err != nil { return 0, nil, 0, syserr.FromError(err) diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go index 67737ae87..f0fe18684 100644 --- a/pkg/sentry/socket/netstack/stack.go +++ b/pkg/sentry/socket/netstack/stack.go @@ -207,6 +207,20 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error { return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(enabled))).ToError() } +// TCPRecovery implements inet.Stack.TCPRecovery. +func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) { + var recovery tcp.Recovery + if err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &recovery); err != nil { + return 0, syserr.TranslateNetstackError(err).ToError() + } + return inet.TCPLossRecovery(recovery), nil +} + +// SetTCPRecovery implements inet.Stack.SetTCPRecovery. +func (s *Stack) SetTCPRecovery(recovery inet.TCPLossRecovery) error { + return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.Recovery(recovery))).ToError() +} + // Statistics implements inet.Stack.Statistics. func (s *Stack) Statistics(stat interface{}, arg string) error { switch stats := stat.(type) { diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index d112757fb..04b259d27 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -46,8 +46,8 @@ type ControlMessages struct { } // Release releases Unix domain socket credentials and rights. -func (c *ControlMessages) Release() { - c.Unix.Release() +func (c *ControlMessages) Release(ctx context.Context) { + c.Unix.Release(ctx) } // Socket is an interface combining fs.FileOperations and SocketOps, diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index a1e49cc57..c67b602f0 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -211,7 +211,7 @@ func (e *connectionedEndpoint) Listening() bool { // The socket will be a fresh state after a call to close and may be reused. // That is, close may be used to "unbind" or "disconnect" the socket in error // paths. -func (e *connectionedEndpoint) Close() { +func (e *connectionedEndpoint) Close(ctx context.Context) { e.Lock() var c ConnectedEndpoint var r Receiver @@ -233,7 +233,7 @@ func (e *connectionedEndpoint) Close() { case e.Listening(): close(e.acceptedChan) for n := range e.acceptedChan { - n.Close() + n.Close(ctx) } e.acceptedChan = nil e.path = "" @@ -241,11 +241,11 @@ func (e *connectionedEndpoint) Close() { e.Unlock() if c != nil { c.CloseNotify() - c.Release() + c.Release(ctx) } if r != nil { r.CloseNotify() - r.Release() + r.Release(ctx) } } @@ -340,7 +340,7 @@ func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce Conn return nil default: // Busy; return ECONNREFUSED per spec. - ne.Close() + ne.Close(ctx) e.Unlock() ce.Unlock() return syserr.ErrConnectionRefused diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index 4b06d63ac..70ee8f9b8 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -54,10 +54,10 @@ func (e *connectionlessEndpoint) isBound() bool { // Close puts the endpoint in a closed state and frees all resources associated // with it. -func (e *connectionlessEndpoint) Close() { +func (e *connectionlessEndpoint) Close(ctx context.Context) { e.Lock() if e.connected != nil { - e.connected.Release() + e.connected.Release(ctx) e.connected = nil } @@ -71,7 +71,7 @@ func (e *connectionlessEndpoint) Close() { e.Unlock() r.CloseNotify() - r.Release() + r.Release(ctx) } // BidirectionalConnect implements BoundEndpoint.BidirectionalConnect. @@ -108,10 +108,10 @@ func (e *connectionlessEndpoint) SendMsg(ctx context.Context, data [][]byte, c C if err != nil { return 0, syserr.ErrInvalidEndpointState } - defer connected.Release() + defer connected.Release(ctx) e.Lock() - n, notify, err := connected.Send(data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) + n, notify, err := connected.Send(ctx, data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) e.Unlock() if notify { @@ -135,7 +135,7 @@ func (e *connectionlessEndpoint) Connect(ctx context.Context, server BoundEndpoi e.Lock() if e.connected != nil { - e.connected.Release() + e.connected.Release(ctx) } e.connected = connected e.Unlock() diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go index d8f3ad63d..ef6043e19 100644 --- a/pkg/sentry/socket/unix/transport/queue.go +++ b/pkg/sentry/socket/unix/transport/queue.go @@ -15,6 +15,7 @@ package transport import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" @@ -57,10 +58,10 @@ func (q *queue) Close() { // Both the read and write queues must be notified after resetting: // q.ReaderQueue.Notify(waiter.EventIn) // q.WriterQueue.Notify(waiter.EventOut) -func (q *queue) Reset() { +func (q *queue) Reset(ctx context.Context) { q.mu.Lock() for cur := q.dataList.Front(); cur != nil; cur = cur.Next() { - cur.Release() + cur.Release(ctx) } q.dataList.Reset() q.used = 0 @@ -68,8 +69,8 @@ func (q *queue) Reset() { } // DecRef implements RefCounter.DecRef with destructor q.Reset. -func (q *queue) DecRef() { - q.DecRefWithDestructor(q.Reset) +func (q *queue) DecRef(ctx context.Context) { + q.DecRefWithDestructor(ctx, q.Reset) // We don't need to notify after resetting because no one cares about // this queue after all references have been dropped. } @@ -111,7 +112,7 @@ func (q *queue) IsWritable() bool { // // If notify is true, ReaderQueue.Notify must be called: // q.ReaderQueue.Notify(waiter.EventIn) -func (q *queue) Enqueue(data [][]byte, c ControlMessages, from tcpip.FullAddress, discardEmpty bool, truncate bool) (l int64, notify bool, err *syserr.Error) { +func (q *queue) Enqueue(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress, discardEmpty bool, truncate bool) (l int64, notify bool, err *syserr.Error) { q.mu.Lock() if q.closed { @@ -124,7 +125,7 @@ func (q *queue) Enqueue(data [][]byte, c ControlMessages, from tcpip.FullAddress } if discardEmpty && l == 0 { q.mu.Unlock() - c.Release() + c.Release(ctx) return 0, false, nil } diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 2f1b127df..475d7177e 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -37,7 +37,7 @@ type RightsControlMessage interface { Clone() RightsControlMessage // Release releases any resources owned by the RightsControlMessage. - Release() + Release(ctx context.Context) } // A CredentialsControlMessage is a control message containing Unix credentials. @@ -74,9 +74,9 @@ func (c *ControlMessages) Clone() ControlMessages { } // Release releases both the credentials and the rights. -func (c *ControlMessages) Release() { +func (c *ControlMessages) Release(ctx context.Context) { if c.Rights != nil { - c.Rights.Release() + c.Rights.Release(ctx) } *c = ControlMessages{} } @@ -90,7 +90,7 @@ type Endpoint interface { // Close puts the endpoint in a closed state and frees all resources // associated with it. - Close() + Close(ctx context.Context) // RecvMsg reads data and a control message from the endpoint. This method // does not block if there is no data pending. @@ -252,7 +252,7 @@ type BoundEndpoint interface { // Release releases any resources held by the BoundEndpoint. It must be // called before dropping all references to a BoundEndpoint returned by a // function. - Release() + Release(ctx context.Context) } // message represents a message passed over a Unix domain socket. @@ -281,8 +281,8 @@ func (m *message) Length() int64 { } // Release releases any resources held by the message. -func (m *message) Release() { - m.Control.Release() +func (m *message) Release(ctx context.Context) { + m.Control.Release(ctx) } // Peek returns a copy of the message. @@ -304,7 +304,7 @@ type Receiver interface { // See Endpoint.RecvMsg for documentation on shared arguments. // // notify indicates if RecvNotify should be called. - Recv(data [][]byte, creds bool, numRights int, peek bool) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, source tcpip.FullAddress, notify bool, err *syserr.Error) + Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, source tcpip.FullAddress, notify bool, err *syserr.Error) // RecvNotify notifies the Receiver of a successful Recv. This must not be // called while holding any endpoint locks. @@ -333,7 +333,7 @@ type Receiver interface { // Release releases any resources owned by the Receiver. It should be // called before droping all references to a Receiver. - Release() + Release(ctx context.Context) } // queueReceiver implements Receiver for datagram sockets. @@ -344,7 +344,7 @@ type queueReceiver struct { } // Recv implements Receiver.Recv. -func (q *queueReceiver) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { +func (q *queueReceiver) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { var m *message var notify bool var err *syserr.Error @@ -398,8 +398,8 @@ func (q *queueReceiver) RecvMaxQueueSize() int64 { } // Release implements Receiver.Release. -func (q *queueReceiver) Release() { - q.readQueue.DecRef() +func (q *queueReceiver) Release(ctx context.Context) { + q.readQueue.DecRef(ctx) } // streamQueueReceiver implements Receiver for stream sockets. @@ -456,7 +456,7 @@ func (q *streamQueueReceiver) RecvMaxQueueSize() int64 { } // Recv implements Receiver.Recv. -func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { +func (q *streamQueueReceiver) Recv(ctx context.Context, data [][]byte, wantCreds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { q.mu.Lock() defer q.mu.Unlock() @@ -502,7 +502,7 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int, var cmTruncated bool if c.Rights != nil && numRights == 0 { - c.Rights.Release() + c.Rights.Release(ctx) c.Rights = nil cmTruncated = true } @@ -557,7 +557,7 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int, // Consume rights. if numRights == 0 { cmTruncated = true - q.control.Rights.Release() + q.control.Rights.Release(ctx) } else { c.Rights = q.control.Rights haveRights = true @@ -582,7 +582,7 @@ type ConnectedEndpoint interface { // // syserr.ErrWouldBlock can be returned along with a partial write if // the caller should block to send the rest of the data. - Send(data [][]byte, c ControlMessages, from tcpip.FullAddress) (n int64, notify bool, err *syserr.Error) + Send(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress) (n int64, notify bool, err *syserr.Error) // SendNotify notifies the ConnectedEndpoint of a successful Send. This // must not be called while holding any endpoint locks. @@ -616,7 +616,7 @@ type ConnectedEndpoint interface { // Release releases any resources owned by the ConnectedEndpoint. It should // be called before droping all references to a ConnectedEndpoint. - Release() + Release(ctx context.Context) // CloseUnread sets the fact that this end is closed with unread data to // the peer socket. @@ -654,7 +654,7 @@ func (e *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) } // Send implements ConnectedEndpoint.Send. -func (e *connectedEndpoint) Send(data [][]byte, c ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { +func (e *connectedEndpoint) Send(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { discardEmpty := false truncate := false if e.endpoint.Type() == linux.SOCK_STREAM { @@ -669,7 +669,7 @@ func (e *connectedEndpoint) Send(data [][]byte, c ControlMessages, from tcpip.Fu truncate = true } - return e.writeQueue.Enqueue(data, c, from, discardEmpty, truncate) + return e.writeQueue.Enqueue(ctx, data, c, from, discardEmpty, truncate) } // SendNotify implements ConnectedEndpoint.SendNotify. @@ -707,8 +707,8 @@ func (e *connectedEndpoint) SendMaxQueueSize() int64 { } // Release implements ConnectedEndpoint.Release. -func (e *connectedEndpoint) Release() { - e.writeQueue.DecRef() +func (e *connectedEndpoint) Release(ctx context.Context) { + e.writeQueue.DecRef(ctx) } // CloseUnread implements ConnectedEndpoint.CloseUnread. @@ -798,7 +798,7 @@ func (e *baseEndpoint) RecvMsg(ctx context.Context, data [][]byte, creds bool, n return 0, 0, ControlMessages{}, false, syserr.ErrNotConnected } - recvLen, msgLen, cms, cmt, a, notify, err := e.receiver.Recv(data, creds, numRights, peek) + recvLen, msgLen, cms, cmt, a, notify, err := e.receiver.Recv(ctx, data, creds, numRights, peek) e.Unlock() if err != nil { return 0, 0, ControlMessages{}, false, err @@ -827,7 +827,7 @@ func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMess return 0, syserr.ErrAlreadyConnected } - n, notify, err := e.connected.Send(data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) + n, notify, err := e.connected.Send(ctx, data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) e.Unlock() if notify { @@ -1001,6 +1001,6 @@ func (e *baseEndpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { } // Release implements BoundEndpoint.Release. -func (*baseEndpoint) Release() { +func (*baseEndpoint) Release(context.Context) { // Binding a baseEndpoint doesn't take a reference. } diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 0482d33cf..2b8454edb 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -62,7 +62,7 @@ type SocketOperations struct { // New creates a new unix socket. func New(ctx context.Context, endpoint transport.Endpoint, stype linux.SockType) *fs.File { dirent := socket.NewDirent(ctx, unixSocketDevice) - defer dirent.DecRef() + defer dirent.DecRef(ctx) return NewWithDirent(ctx, dirent, endpoint, stype, fs.FileFlags{Read: true, Write: true, NonSeekable: true}) } @@ -97,17 +97,17 @@ type socketOpsCommon struct { } // DecRef implements RefCounter.DecRef. -func (s *socketOpsCommon) DecRef() { - s.DecRefWithDestructor(func() { - s.ep.Close() +func (s *socketOpsCommon) DecRef(ctx context.Context) { + s.DecRefWithDestructor(ctx, func(context.Context) { + s.ep.Close(ctx) }) } // Release implemements fs.FileOperations.Release. -func (s *socketOpsCommon) Release() { +func (s *socketOpsCommon) Release(ctx context.Context) { // Release only decrements a reference on s because s may be referenced in // the abstract socket namespace. - s.DecRef() + s.DecRef(ctx) } func (s *socketOpsCommon) isPacket() bool { @@ -234,7 +234,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, } ns := New(t, ep, s.stype) - defer ns.DecRef() + defer ns.DecRef(t) if flags&linux.SOCK_NONBLOCK != 0 { flags := ns.Flags() @@ -284,7 +284,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { if t.IsNetworkNamespaced() { return syserr.ErrInvalidEndpointState } - if err := t.AbstractSockets().Bind(p[1:], bep, s); err != nil { + if err := t.AbstractSockets().Bind(t, p[1:], bep, s); err != nil { // syserr.ErrPortInUse corresponds to EADDRINUSE. return syserr.ErrPortInUse } @@ -294,7 +294,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { var name string cwd := t.FSContext().WorkingDirectory() - defer cwd.DecRef() + defer cwd.DecRef(t) // Is there no slash at all? if !strings.Contains(p, "/") { @@ -302,7 +302,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { name = p } else { root := t.FSContext().RootDirectory() - defer root.DecRef() + defer root.DecRef(t) // Find the last path component, we know that something follows // that final slash, otherwise extractPath() would have failed. lastSlash := strings.LastIndex(p, "/") @@ -318,7 +318,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { // No path available. return syserr.ErrNoSuchFile } - defer d.DecRef() + defer d.DecRef(t) name = p[lastSlash+1:] } @@ -332,7 +332,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { if err != nil { return syserr.ErrPortInUse } - childDir.DecRef() + childDir.DecRef(t) } return nil @@ -378,9 +378,9 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint, FollowFinalSymlink: true, } ep, e := t.Kernel().VFS().BoundEndpointAt(t, t.Credentials(), &pop, &vfs.BoundEndpointOptions{path}) - root.DecRef() + root.DecRef(t) if relPath { - start.DecRef() + start.DecRef(t) } if e != nil { return nil, syserr.FromError(e) @@ -393,15 +393,15 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint, cwd := t.FSContext().WorkingDirectory() remainingTraversals := uint(fs.DefaultTraversalLimit) d, e := t.MountNamespace().FindInode(t, root, cwd, path, &remainingTraversals) - cwd.DecRef() - root.DecRef() + cwd.DecRef(t) + root.DecRef(t) if e != nil { return nil, syserr.FromError(e) } // Extract the endpoint if one is there. ep := d.Inode.BoundEndpoint(path) - d.DecRef() + d.DecRef(t) if ep == nil { // No socket! return nil, syserr.ErrConnectionRefused @@ -415,7 +415,7 @@ func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool if err != nil { return err } - defer ep.Release() + defer ep.Release(t) // Connect the server endpoint. err = s.ep.Connect(t, ep) @@ -473,7 +473,7 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b if err != nil { return 0, err } - defer ep.Release() + defer ep.Release(t) w.To = ep if ep.Passcred() && w.Control.Credentials == nil { diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index 05c16fcfe..dfa25241a 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -136,7 +136,7 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block if err != nil { return 0, nil, 0, err } - defer ns.DecRef() + defer ns.DecRef(t) if flags&linux.SOCK_NONBLOCK != 0 { ns.SetStatusFlags(t, t.Credentials(), linux.SOCK_NONBLOCK) @@ -183,19 +183,19 @@ func (s *SocketVFS2) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { if t.IsNetworkNamespaced() { return syserr.ErrInvalidEndpointState } - if err := t.AbstractSockets().Bind(p[1:], bep, s); err != nil { + if err := t.AbstractSockets().Bind(t, p[1:], bep, s); err != nil { // syserr.ErrPortInUse corresponds to EADDRINUSE. return syserr.ErrPortInUse } } else { path := fspath.Parse(p) root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) start := root relPath := !path.Absolute if relPath { start = t.FSContext().WorkingDirectoryVFS2() - defer start.DecRef() + defer start.DecRef(t) } pop := vfs.PathOperation{ Root: root, @@ -333,7 +333,7 @@ func (*providerVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) f, err := NewSockfsFile(t, ep, stype) if err != nil { - ep.Close() + ep.Close(t) return nil, err } return f, nil @@ -357,14 +357,14 @@ func (*providerVFS2) Pair(t *kernel.Task, stype linux.SockType, protocol int) (* ep1, ep2 := transport.NewPair(t, stype, t.Kernel()) s1, err := NewSockfsFile(t, ep1, stype) if err != nil { - ep1.Close() - ep2.Close() + ep1.Close(t) + ep2.Close(t) return nil, nil, err } s2, err := NewSockfsFile(t, ep2, stype) if err != nil { - s1.DecRef() - ep2.Close() + s1.DecRef(t) + ep2.Close(t) return nil, nil, err } diff --git a/pkg/sentry/state/state.go b/pkg/sentry/state/state.go index 9eb626b76..a06c9b8ab 100644 --- a/pkg/sentry/state/state.go +++ b/pkg/sentry/state/state.go @@ -60,6 +60,7 @@ type SaveOpts struct { func (opts SaveOpts) Save(k *kernel.Kernel, w *watchdog.Watchdog) error { log.Infof("Sandbox save started, pausing all tasks.") k.Pause() + k.ReceiveTaskStates() defer k.Unpause() defer log.Infof("Tasks resumed after save.") diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index c0512de89..b51c4c941 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -521,6 +521,7 @@ var sockOptNames = map[uint64]abi.ValueSet{ linux.IP_ROUTER_ALERT: "IP_ROUTER_ALERT", linux.IP_PKTOPTIONS: "IP_PKTOPTIONS", linux.IP_MTU: "IP_MTU", + linux.SO_ORIGINAL_DST: "SO_ORIGINAL_DST", }, linux.SOL_SOCKET: { linux.SO_ERROR: "SO_ERROR", diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 68ca537c8..87b239730 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -147,14 +147,14 @@ func fd(t *kernel.Task, fd int32) string { root := t.FSContext().RootDirectory() if root != nil { - defer root.DecRef() + defer root.DecRef(t) } if fd == linux.AT_FDCWD { wd := t.FSContext().WorkingDirectory() var name string if wd != nil { - defer wd.DecRef() + defer wd.DecRef(t) name, _ = wd.FullName(root) } else { name = "(unknown cwd)" @@ -167,7 +167,7 @@ func fd(t *kernel.Task, fd int32) string { // Cast FD to uint64 to avoid printing negative hex. return fmt.Sprintf("%#x (bad FD)", uint64(fd)) } - defer file.DecRef() + defer file.DecRef(t) name, _ := file.Dirent.FullName(root) return fmt.Sprintf("%#x %s", fd, name) @@ -175,12 +175,12 @@ func fd(t *kernel.Task, fd int32) string { func fdVFS2(t *kernel.Task, fd int32) string { root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) vfsObj := root.Mount().Filesystem().VirtualFilesystem() if fd == linux.AT_FDCWD { wd := t.FSContext().WorkingDirectoryVFS2() - defer wd.DecRef() + defer wd.DecRef(t) name, _ := vfsObj.PathnameWithDeleted(t, root, wd) return fmt.Sprintf("AT_FDCWD %s", name) @@ -191,7 +191,7 @@ func fdVFS2(t *kernel.Task, fd int32) string { // Cast FD to uint64 to avoid printing negative hex. return fmt.Sprintf("%#x (bad FD)", uint64(fd)) } - defer file.DecRef() + defer file.DecRef(t) name, _ := vfsObj.PathnameWithDeleted(t, root, file.VirtualDentry()) return fmt.Sprintf("%#x %s", fd, name) diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go index d9fb808c0..d23a0068a 100644 --- a/pkg/sentry/syscalls/epoll.go +++ b/pkg/sentry/syscalls/epoll.go @@ -28,7 +28,7 @@ import ( // CreateEpoll implements the epoll_create(2) linux syscall. func CreateEpoll(t *kernel.Task, closeOnExec bool) (int32, error) { file := epoll.NewEventPoll(t) - defer file.DecRef() + defer file.DecRef(t) fd, err := t.NewFDFrom(0, file, kernel.FDFlags{ CloseOnExec: closeOnExec, @@ -47,14 +47,14 @@ func AddEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask if epollfile == nil { return syserror.EBADF } - defer epollfile.DecRef() + defer epollfile.DecRef(t) // Get the target file id. file := t.GetFile(fd) if file == nil { return syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) @@ -73,14 +73,14 @@ func UpdateEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, m if epollfile == nil { return syserror.EBADF } - defer epollfile.DecRef() + defer epollfile.DecRef(t) // Get the target file id. file := t.GetFile(fd) if file == nil { return syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) @@ -99,14 +99,14 @@ func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error { if epollfile == nil { return syserror.EBADF } - defer epollfile.DecRef() + defer epollfile.DecRef(t) // Get the target file id. file := t.GetFile(fd) if file == nil { return syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) @@ -115,7 +115,7 @@ func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error { } // Try to remove the entry. - return e.RemoveEntry(epoll.FileIdentifier{file, fd}) + return e.RemoveEntry(t, epoll.FileIdentifier{file, fd}) } // WaitEpoll implements the epoll_wait(2) linux syscall. @@ -125,7 +125,7 @@ func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]linux.EpollEve if epollfile == nil { return nil, syserror.EBADF } - defer epollfile.DecRef() + defer epollfile.DecRef(t) // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index ba2557c52..e9d64dec5 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -247,7 +247,7 @@ func getAIOCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *linu ev.Result = -int64(kernel.ExtractErrno(err, 0)) } - file.DecRef() + file.DecRef(ctx) // Queue the result for delivery. actx.FinishRequest(ev) @@ -257,7 +257,7 @@ func getAIOCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *linu // wake up. if eventFile != nil { eventFile.FileOperations.(*eventfd.EventOperations).Signal(1) - eventFile.DecRef() + eventFile.DecRef(ctx) } } } @@ -269,7 +269,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr user // File not found. return syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Was there an eventFD? Extract it. var eventFile *fs.File @@ -279,7 +279,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr user // Bad FD. return syserror.EBADF } - defer eventFile.DecRef() + defer eventFile.DecRef(t) // Check that it is an eventfd. if _, ok := eventFile.FileOperations.(*eventfd.EventOperations); !ok { diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go index ed3413ca6..3b4f879e4 100644 --- a/pkg/sentry/syscalls/linux/sys_eventfd.go +++ b/pkg/sentry/syscalls/linux/sys_eventfd.go @@ -37,7 +37,7 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc event.SetFlags(fs.SettableFileFlags{ NonBlocking: flags&linux.EFD_NONBLOCK != 0, }) - defer event.DecRef() + defer event.DecRef(t) fd, err := t.NewFDFrom(0, event, kernel.FDFlags{ CloseOnExec: flags&linux.EFD_CLOEXEC != 0, diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 2797c6a72..1bc9b184e 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -40,7 +40,7 @@ func fileOpAt(t *kernel.Task, dirFD int32, path string, fn func(root *fs.Dirent, // Common case: we are accessing a file in the root. root := t.FSContext().RootDirectory() err := fn(root, root, name, linux.MaxSymlinkTraversals) - root.DecRef() + root.DecRef(t) return err } else if dir == "." && dirFD == linux.AT_FDCWD { // Common case: we are accessing a file relative to the current @@ -48,8 +48,8 @@ func fileOpAt(t *kernel.Task, dirFD int32, path string, fn func(root *fs.Dirent, wd := t.FSContext().WorkingDirectory() root := t.FSContext().RootDirectory() err := fn(root, wd, name, linux.MaxSymlinkTraversals) - wd.DecRef() - root.DecRef() + wd.DecRef(t) + root.DecRef(t) return err } @@ -97,19 +97,19 @@ func fileOpOn(t *kernel.Task, dirFD int32, path string, resolve bool, fn func(ro } else { d, err = t.MountNamespace().FindLink(t, root, rel, path, &remainingTraversals) } - root.DecRef() + root.DecRef(t) if wd != nil { - wd.DecRef() + wd.DecRef(t) } if f != nil { - f.DecRef() + f.DecRef(t) } if err != nil { return err } err = fn(root, d, remainingTraversals) - d.DecRef() + d.DecRef(t) return err } @@ -186,7 +186,7 @@ func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uint if err != nil { return syserror.ConvertIntr(err, kernel.ERESTARTSYS) } - defer file.DecRef() + defer file.DecRef(t) // Success. newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{ @@ -242,7 +242,7 @@ func mknodAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode if err != nil { return err } - file.DecRef() + file.DecRef(t) return nil case linux.ModeNamedPipe: @@ -332,7 +332,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l if err != nil { break } - defer found.DecRef() + defer found.DecRef(t) // We found something (possibly a symlink). If the // O_EXCL flag was passed, then we can immediately @@ -357,7 +357,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l resolved, err = found.Inode.Getlink(t) if err == nil { // No more resolution necessary. - defer resolved.DecRef() + defer resolved.DecRef(t) break } if err != fs.ErrResolveViaReadlink { @@ -384,7 +384,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l if err != nil { break } - defer newParent.DecRef() + defer newParent.DecRef(t) // Repeat the process with the parent and name of the // symlink target. @@ -416,7 +416,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l if err != nil { return syserror.ConvertIntr(err, kernel.ERESTARTSYS) } - defer newFile.DecRef() + defer newFile.DecRef(t) case syserror.ENOENT: // File does not exist. Proceed with creation. @@ -432,7 +432,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l // No luck, bail. return err } - defer newFile.DecRef() + defer newFile.DecRef(t) found = newFile.Dirent default: return err @@ -596,7 +596,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Shared flags between file and socket. switch request { @@ -671,9 +671,9 @@ func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal addr := args[0].Pointer() size := args[1].SizeT() cwd := t.FSContext().WorkingDirectory() - defer cwd.DecRef() + defer cwd.DecRef(t) root := t.FSContext().RootDirectory() - defer root.DecRef() + defer root.DecRef(t) // Get our fullname from the root and preprend unreachable if the root was // unreachable from our current dirent this is the same behavior as on linux. @@ -722,7 +722,7 @@ func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return err } - t.FSContext().SetRootDirectory(d) + t.FSContext().SetRootDirectory(t, d) return nil }) } @@ -747,7 +747,7 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return err } - t.FSContext().SetWorkingDirectory(d) + t.FSContext().SetWorkingDirectory(t, d) return nil }) } @@ -760,7 +760,7 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Is it a directory? if !fs.IsDir(file.Dirent.Inode.StableAttr) { @@ -772,7 +772,7 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, err } - t.FSContext().SetWorkingDirectory(file.Dirent) + t.FSContext().SetWorkingDirectory(t, file.Dirent) return 0, nil, nil } @@ -791,7 +791,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) err := file.Flush(t) return 0, nil, handleIOError(t, false /* partial */, err, syserror.EINTR, "close", file) @@ -805,7 +805,7 @@ func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{}) if err != nil { @@ -826,7 +826,7 @@ func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if oldFile == nil { return 0, nil, syserror.EBADF } - defer oldFile.DecRef() + defer oldFile.DecRef(t) return uintptr(newfd), nil, nil } @@ -850,7 +850,7 @@ func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if oldFile == nil { return 0, nil, syserror.EBADF } - defer oldFile.DecRef() + defer oldFile.DecRef(t) err := t.NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}) if err != nil { @@ -925,7 +925,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) switch cmd { case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: @@ -1057,7 +1057,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.F_SETOWN_EX: addr := args[2].Pointer() var owner linux.FOwnerEx - n, err := t.CopyIn(addr, &owner) + _, err := t.CopyIn(addr, &owner) if err != nil { return 0, nil, err } @@ -1069,21 +1069,21 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, syserror.ESRCH } a.SetOwnerTask(t, task) - return uintptr(n), nil, nil + return 0, nil, nil case linux.F_OWNER_PID: tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(owner.PID)) if tg == nil { return 0, nil, syserror.ESRCH } a.SetOwnerThreadGroup(t, tg) - return uintptr(n), nil, nil + return 0, nil, nil case linux.F_OWNER_PGRP: pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(owner.PID)) if pg == nil { return 0, nil, syserror.ESRCH } a.SetOwnerProcessGroup(t, pg) - return uintptr(n), nil, nil + return 0, nil, nil default: return 0, nil, syserror.EINVAL } @@ -1132,7 +1132,7 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // If the FD refers to a pipe or FIFO, return error. if fs.IsPipe(file.Dirent.Inode.StableAttr) { @@ -1171,7 +1171,7 @@ func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode switch err { case nil: // The directory existed. - defer f.DecRef() + defer f.DecRef(t) return syserror.EEXIST case syserror.EACCES: // Permission denied while walking to the directory. @@ -1349,7 +1349,7 @@ func linkAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32 if target == nil { return syserror.EBADF } - defer target.DecRef() + defer target.DecRef(t) if err := mayLinkAt(t, target.Dirent.Inode); err != nil { return err } @@ -1602,7 +1602,7 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Reject truncation if the file flags do not permit this operation. // This is different from truncate(2) above. @@ -1730,7 +1730,7 @@ func chownAt(t *kernel.Task, fd int32, addr usermem.Addr, resolve, allowEmpty bo if file == nil { return syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return chown(t, file.Dirent, uid, gid) } @@ -1768,7 +1768,7 @@ func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, chown(t, file.Dirent, uid, gid) } @@ -1833,7 +1833,7 @@ func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, chmod(t, file.Dirent, mode) } @@ -1893,10 +1893,10 @@ func utimes(t *kernel.Task, dirFD int32, addr usermem.Addr, ts fs.TimeSpec, reso if f == nil { return syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) root := t.FSContext().RootDirectory() - defer root.DecRef() + defer root.DecRef(t) return setTimestamp(root, f.Dirent, linux.MaxSymlinkTraversals) } @@ -2088,7 +2088,7 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) if offset < 0 || length <= 0 { return 0, nil, syserror.EINVAL @@ -2141,7 +2141,7 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // flock(2): EBADF fd is not an open file descriptor. return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) nonblocking := operation&linux.LOCK_NB != 0 operation &^= linux.LOCK_NB @@ -2224,8 +2224,8 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S return 0, nil, err } - defer dirent.DecRef() - defer file.DecRef() + defer dirent.DecRef(t) + defer file.DecRef(t) newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{ CloseOnExec: cloExec, diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index f04d78856..9d1b2edb1 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -73,7 +73,7 @@ func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, fo err = t.BlockWithDeadline(w.C, true, ktime.FromTimespec(ts)) } - t.Futex().WaitComplete(w) + t.Futex().WaitComplete(w, t) return 0, syserror.ConvertIntr(err, kernel.ERESTARTSYS) } @@ -95,7 +95,7 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add } remaining, err := t.BlockWithTimeout(w.C, !forever, duration) - t.Futex().WaitComplete(w) + t.Futex().WaitComplete(w, t) if err == nil { return 0, nil } @@ -148,7 +148,7 @@ func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr usermem.A timer.Destroy() } - t.Futex().WaitComplete(w) + t.Futex().WaitComplete(w, t) return syserror.ConvertIntr(err, kernel.ERESTARTSYS) } diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index b126fecc0..f5699e55d 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -68,7 +68,7 @@ func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dir if dir == nil { return 0, syserror.EBADF } - defer dir.DecRef() + defer dir.DecRef(t) w := &usermem.IOReadWriter{ Ctx: t, diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go index b2c7b3444..cf47bb9dd 100644 --- a/pkg/sentry/syscalls/linux/sys_inotify.go +++ b/pkg/sentry/syscalls/linux/sys_inotify.go @@ -40,7 +40,7 @@ func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. NonBlocking: flags&linux.IN_NONBLOCK != 0, } n := fs.NewFile(t, dirent, fileFlags, fs.NewInotify(t)) - defer n.DecRef() + defer n.DecRef(t) fd, err := t.NewFDFrom(0, n, kernel.FDFlags{ CloseOnExec: flags&linux.IN_CLOEXEC != 0, @@ -71,7 +71,7 @@ func fdToInotify(t *kernel.Task, fd int32) (*fs.Inotify, *fs.File, error) { ino, ok := file.FileOperations.(*fs.Inotify) if !ok { // Not an inotify fd. - file.DecRef() + file.DecRef(t) return nil, nil, syserror.EINVAL } @@ -98,7 +98,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { @@ -128,6 +128,6 @@ func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if err != nil { return 0, nil, err } - defer file.DecRef() - return 0, nil, ino.RmWatch(wd) + defer file.DecRef(t) + return 0, nil, ino.RmWatch(t, wd) } diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go index 3f7691eae..1c38f8f4f 100644 --- a/pkg/sentry/syscalls/linux/sys_lseek.go +++ b/pkg/sentry/syscalls/linux/sys_lseek.go @@ -33,7 +33,7 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) var sw fs.SeekWhence switch whence { diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 91694d374..72786b032 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -75,7 +75,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC } defer func() { if opts.MappingIdentity != nil { - opts.MappingIdentity.DecRef() + opts.MappingIdentity.DecRef(t) } }() @@ -85,7 +85,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) flags := file.Flags() // mmap unconditionally requires that the FD is readable. diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go index eb5ff48f5..bd0633564 100644 --- a/pkg/sentry/syscalls/linux/sys_mount.go +++ b/pkg/sentry/syscalls/linux/sys_mount.go @@ -115,7 +115,7 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall }); err != nil { // Something went wrong. Drop our ref on rootInode before // returning the error. - rootInode.DecRef() + rootInode.DecRef(t) return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go index 43c510930..3149e4aad 100644 --- a/pkg/sentry/syscalls/linux/sys_pipe.go +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -34,10 +34,10 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { r, w := pipe.NewConnectedPipe(t, pipe.DefaultPipeSize, usermem.PageSize) r.SetFlags(linuxToFlags(flags).Settable()) - defer r.DecRef() + defer r.DecRef(t) w.SetFlags(linuxToFlags(flags).Settable()) - defer w.DecRef() + defer w.DecRef(t) fds, err := t.NewFDs(0, []*fs.File{r, w}, kernel.FDFlags{ CloseOnExec: flags&linux.O_CLOEXEC != 0, @@ -49,7 +49,7 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { if _, err := t.CopyOut(addr, fds); err != nil { for _, fd := range fds { if file, _ := t.FDTable().Remove(fd); file != nil { - file.DecRef() + file.DecRef(t) } } return 0, err diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index f0198141c..3435bdf77 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -70,7 +70,7 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan } if ch == nil { - defer file.DecRef() + defer file.DecRef(t) } else { state.file = file state.waiter, _ = waiter.NewChannelEntry(ch) @@ -82,11 +82,11 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan } // releaseState releases all the pollState in "state". -func releaseState(state []pollState) { +func releaseState(t *kernel.Task, state []pollState) { for i := range state { if state[i].file != nil { state[i].file.EventUnregister(&state[i].waiter) - state[i].file.DecRef() + state[i].file.DecRef(t) } } } @@ -107,7 +107,7 @@ func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time. // result, we stop registering for events but still go through all files // to get their ready masks. state := make([]pollState, len(pfd)) - defer releaseState(state) + defer releaseState(t, state) n := uintptr(0) for i := range pfd { initReadiness(t, &pfd[i], &state[i], ch) @@ -266,7 +266,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add if file == nil { return 0, syserror.EBADF } - file.DecRef() + file.DecRef(t) var mask int16 if (rV & m) != 0 { diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index f92bf8096..64a725296 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -128,7 +128,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // They trying to set exe to a non-file? if !fs.IsFile(file.Dirent.Inode.StableAttr) { @@ -136,7 +136,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } // Set the underlying executable. - t.MemoryManager().SetExecutable(fsbridge.NewFSFile(file)) + t.MemoryManager().SetExecutable(t, fsbridge.NewFSFile(file)) case linux.PR_SET_MM_AUXV, linux.PR_SET_MM_START_CODE, diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index 071b4bacc..3bbc3fa4b 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -48,7 +48,7 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the file is readable. if !file.Flags().Read { @@ -84,7 +84,7 @@ func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the file is readable. if !file.Flags().Read { @@ -118,7 +118,7 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { @@ -164,7 +164,7 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the file is readable. if !file.Flags().Read { @@ -195,7 +195,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { @@ -244,7 +244,7 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go index 4a8bc24a2..f0ae8fa8e 100644 --- a/pkg/sentry/syscalls/linux/sys_shm.go +++ b/pkg/sentry/syscalls/linux/sys_shm.go @@ -39,7 +39,7 @@ func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - defer segment.DecRef() + defer segment.DecRef(t) return uintptr(segment.ID), nil, nil } @@ -66,7 +66,7 @@ func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if err != nil { return 0, nil, syserror.EINVAL } - defer segment.DecRef() + defer segment.DecRef(t) opts, err := segment.ConfigureAttach(t, addr, shm.AttachOpts{ Execute: flag&linux.SHM_EXEC == linux.SHM_EXEC, @@ -108,7 +108,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, syserror.EINVAL } - defer segment.DecRef() + defer segment.DecRef(t) stat, err := segment.IPCStat(t) if err == nil { @@ -132,7 +132,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, syserror.EINVAL } - defer segment.DecRef() + defer segment.DecRef(t) switch cmd { case linux.IPC_SET: @@ -145,7 +145,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, err case linux.IPC_RMID: - segment.MarkDestroyed() + segment.MarkDestroyed(t) return 0, nil, nil case linux.SHM_LOCK, linux.SHM_UNLOCK: diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go index d2b0012ae..20cb1a5cb 100644 --- a/pkg/sentry/syscalls/linux/sys_signal.go +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -536,7 +536,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Is this a signalfd? if s, ok := file.FileOperations.(*signalfd.SignalOperations); ok { @@ -553,7 +553,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) // Set appropriate flags. file.SetFlags(fs.SettableFileFlags{ diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 414fce8e3..fec1c1974 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -200,7 +200,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal s.SetFlags(fs.SettableFileFlags{ NonBlocking: stype&linux.SOCK_NONBLOCK != 0, }) - defer s.DecRef() + defer s.DecRef(t) fd, err := t.NewFDFrom(0, s, kernel.FDFlags{ CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, @@ -235,8 +235,8 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } s1.SetFlags(fileFlags) s2.SetFlags(fileFlags) - defer s1.DecRef() - defer s2.DecRef() + defer s1.DecRef(t) + defer s2.DecRef(t) // Create the FDs for the sockets. fds, err := t.NewFDs(0, []*fs.File{s1, s2}, kernel.FDFlags{ @@ -250,7 +250,7 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if _, err := t.CopyOut(socks, fds); err != nil { for _, fd := range fds { if file, _ := t.FDTable().Remove(fd); file != nil { - file.DecRef() + file.DecRef(t) } } return 0, nil, err @@ -270,7 +270,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -301,7 +301,7 @@ func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, f if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -360,7 +360,7 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -387,7 +387,7 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -416,7 +416,7 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -447,7 +447,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -529,7 +529,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -567,7 +567,7 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -595,7 +595,7 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -628,7 +628,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -681,7 +681,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -775,7 +775,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i } if !cms.Unix.Empty() { mflags |= linux.MSG_CTRUNC - cms.Release() + cms.Release(t) } if int(msg.Flags) != mflags { @@ -795,7 +795,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i if e != nil { return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) } - defer cms.Release() + defer cms.Release(t) controlData := make([]byte, 0, msg.ControlLen) controlData = control.PackControlMessages(t, cms, controlData) @@ -851,7 +851,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -880,7 +880,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag } n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) - cm.Release() + cm.Release(t) if e != nil { return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) } @@ -924,7 +924,7 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -962,7 +962,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -1066,7 +1066,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) err = handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file) if err != nil { - controlMessages.Release() + controlMessages.Release(t) } return uintptr(n), err } @@ -1084,7 +1084,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go index 77c78889d..b8846a10a 100644 --- a/pkg/sentry/syscalls/linux/sys_splice.go +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -101,7 +101,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if inFile == nil { return 0, nil, syserror.EBADF } - defer inFile.DecRef() + defer inFile.DecRef(t) if !inFile.Flags().Read { return 0, nil, syserror.EBADF @@ -111,7 +111,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if outFile == nil { return 0, nil, syserror.EBADF } - defer outFile.DecRef() + defer outFile.DecRef(t) if !outFile.Flags().Write { return 0, nil, syserror.EBADF @@ -192,13 +192,13 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if outFile == nil { return 0, nil, syserror.EBADF } - defer outFile.DecRef() + defer outFile.DecRef(t) inFile := t.GetFile(inFD) if inFile == nil { return 0, nil, syserror.EBADF } - defer inFile.DecRef() + defer inFile.DecRef(t) // The operation is non-blocking if anything is non-blocking. // @@ -300,13 +300,13 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if outFile == nil { return 0, nil, syserror.EBADF } - defer outFile.DecRef() + defer outFile.DecRef(t) inFile := t.GetFile(inFD) if inFile == nil { return 0, nil, syserror.EBADF } - defer inFile.DecRef() + defer inFile.DecRef(t) // All files must be pipes. if !fs.IsPipe(inFile.Dirent.Inode.StableAttr) || !fs.IsPipe(outFile.Dirent.Inode.StableAttr) { diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 46ebf27a2..a5826f2dd 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -58,7 +58,7 @@ func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, fstat(t, file, statAddr) } @@ -100,7 +100,7 @@ func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, fstat(t, file, statAddr) } @@ -158,7 +158,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) uattr, err := file.UnstableAttr(t) if err != nil { return 0, nil, err @@ -249,7 +249,7 @@ func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, statfsImpl(t, file.Dirent, statfsAddr) } diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go index 5ad465ae3..f2c0e5069 100644 --- a/pkg/sentry/syscalls/linux/sys_sync.go +++ b/pkg/sentry/syscalls/linux/sys_sync.go @@ -39,7 +39,7 @@ func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Use "sync-the-world" for now, it's guaranteed that fd is at least // on the root filesystem. @@ -54,7 +54,7 @@ func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncAll) return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) @@ -70,7 +70,7 @@ func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncData) return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) @@ -103,7 +103,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // SYNC_FILE_RANGE_WAIT_BEFORE waits upon write-out of all pages in the // specified range that have already been submitted to the device diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 00915fdde..2d16e4933 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -117,7 +117,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user resolveFinal := flags&linux.AT_SYMLINK_NOFOLLOW == 0 root := t.FSContext().RootDirectory() - defer root.DecRef() + defer root.DecRef(t) var wd *fs.Dirent var executable fsbridge.File @@ -133,7 +133,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) closeOnExec = fdFlags.CloseOnExec if atEmptyPath && len(pathname) == 0 { @@ -155,7 +155,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user } } if wd != nil { - defer wd.DecRef() + defer wd.DecRef(t) } // Load the new TaskContext. diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go index cf49b43db..34b03e4ee 100644 --- a/pkg/sentry/syscalls/linux/sys_timerfd.go +++ b/pkg/sentry/syscalls/linux/sys_timerfd.go @@ -43,7 +43,7 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel return 0, nil, syserror.EINVAL } f := timerfd.NewFile(t, c) - defer f.DecRef() + defer f.DecRef(t) f.SetFlags(fs.SettableFileFlags{ NonBlocking: flags&linux.TFD_NONBLOCK != 0, }) @@ -73,7 +73,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) tf, ok := f.FileOperations.(*timerfd.TimerOperations) if !ok { @@ -107,7 +107,7 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) tf, ok := f.FileOperations.(*timerfd.TimerOperations) if !ok { diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index 6ec0de96e..485526e28 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -48,7 +48,7 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the file is writable. if !file.Flags().Write { @@ -85,7 +85,7 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { @@ -131,7 +131,7 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the file is writable. if !file.Flags().Write { @@ -162,7 +162,7 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { @@ -215,7 +215,7 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go index c24946160..97474fd3c 100644 --- a/pkg/sentry/syscalls/linux/sys_xattr.go +++ b/pkg/sentry/syscalls/linux/sys_xattr.go @@ -49,7 +49,7 @@ func FGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) n, err := getXattr(t, f.Dirent, nameAddr, valueAddr, size) if err != nil { @@ -153,7 +153,7 @@ func FSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) return 0, nil, setXattr(t, f.Dirent, nameAddr, valueAddr, uint64(size), flags) } @@ -270,7 +270,7 @@ func FListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) n, err := listXattr(t, f.Dirent, listAddr, size) if err != nil { @@ -384,7 +384,7 @@ func FRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) return 0, nil, removeXattr(t, f.Dirent, nameAddr) } diff --git a/pkg/sentry/syscalls/linux/vfs2/aio.go b/pkg/sentry/syscalls/linux/vfs2/aio.go index e5cdefc50..399b4f60c 100644 --- a/pkg/sentry/syscalls/linux/vfs2/aio.go +++ b/pkg/sentry/syscalls/linux/vfs2/aio.go @@ -88,7 +88,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr user if fd == nil { return syserror.EBADF } - defer fd.DecRef() + defer fd.DecRef(t) // Was there an eventFD? Extract it. var eventFD *vfs.FileDescription @@ -97,7 +97,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr user if eventFD == nil { return syserror.EBADF } - defer eventFD.DecRef() + defer eventFD.DecRef(t) // Check that it is an eventfd. if _, ok := eventFD.Impl().(*eventfd.EventFileDescription); !ok { @@ -169,7 +169,7 @@ func getAIOCallback(t *kernel.Task, fd, eventFD *vfs.FileDescription, cbAddr use ev.Result = -int64(kernel.ExtractErrno(err, 0)) } - fd.DecRef() + fd.DecRef(ctx) // Queue the result for delivery. aioCtx.FinishRequest(ev) @@ -179,7 +179,7 @@ func getAIOCallback(t *kernel.Task, fd, eventFD *vfs.FileDescription, cbAddr use // wake up. if eventFD != nil { eventFD.Impl().(*eventfd.EventFileDescription).Signal(1) - eventFD.DecRef() + eventFD.DecRef(ctx) } } } diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go index 34c90ae3e..c62f03509 100644 --- a/pkg/sentry/syscalls/linux/vfs2/epoll.go +++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go @@ -37,11 +37,11 @@ func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. return 0, nil, syserror.EINVAL } - file, err := t.Kernel().VFS().NewEpollInstanceFD() + file, err := t.Kernel().VFS().NewEpollInstanceFD(t) if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ CloseOnExec: flags&linux.EPOLL_CLOEXEC != 0, @@ -62,11 +62,11 @@ func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S return 0, nil, syserror.EINVAL } - file, err := t.Kernel().VFS().NewEpollInstanceFD() + file, err := t.Kernel().VFS().NewEpollInstanceFD(t) if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{}) if err != nil { @@ -86,7 +86,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if epfile == nil { return 0, nil, syserror.EBADF } - defer epfile.DecRef() + defer epfile.DecRef(t) ep, ok := epfile.Impl().(*vfs.EpollInstance) if !ok { return 0, nil, syserror.EINVAL @@ -95,7 +95,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) if epfile == file { return 0, nil, syserror.EINVAL } @@ -135,7 +135,7 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if epfile == nil { return 0, nil, syserror.EBADF } - defer epfile.DecRef() + defer epfile.DecRef(t) ep, ok := epfile.Impl().(*vfs.EpollInstance) if !ok { return 0, nil, syserror.EINVAL diff --git a/pkg/sentry/syscalls/linux/vfs2/eventfd.go b/pkg/sentry/syscalls/linux/vfs2/eventfd.go index aff1a2070..807f909da 100644 --- a/pkg/sentry/syscalls/linux/vfs2/eventfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/eventfd.go @@ -38,11 +38,11 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc fileFlags |= linux.O_NONBLOCK } semMode := flags&linux.EFD_SEMAPHORE != 0 - eventfd, err := eventfd.New(vfsObj, initVal, semMode, fileFlags) + eventfd, err := eventfd.New(t, vfsObj, initVal, semMode, fileFlags) if err != nil { return 0, nil, err } - defer eventfd.DecRef() + defer eventfd.DecRef(t) fd, err := t.NewFDFromVFS2(0, eventfd, kernel.FDFlags{ CloseOnExec: flags&linux.EFD_CLOEXEC != 0, diff --git a/pkg/sentry/syscalls/linux/vfs2/execve.go b/pkg/sentry/syscalls/linux/vfs2/execve.go index aef0078a8..066ee0863 100644 --- a/pkg/sentry/syscalls/linux/vfs2/execve.go +++ b/pkg/sentry/syscalls/linux/vfs2/execve.go @@ -71,7 +71,7 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr user } root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) var executable fsbridge.File closeOnExec := false if path := fspath.Parse(pathname); dirfd != linux.AT_FDCWD && !path.Absolute { @@ -90,7 +90,7 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr user } start := dirfile.VirtualDentry() start.IncRef() - dirfile.DecRef() + dirfile.DecRef(t) closeOnExec = dirfileFlags.CloseOnExec file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &vfs.PathOperation{ Root: root, @@ -101,19 +101,19 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr user Flags: linux.O_RDONLY, FileExec: true, }) - start.DecRef() + start.DecRef(t) if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) executable = fsbridge.NewVFSFile(file) } // Load the new TaskContext. mntns := t.MountNamespaceVFS2() // FIXME(jamieliu): useless refcount change - defer mntns.DecRef() + defer mntns.DecRef(t) wd := t.FSContext().WorkingDirectoryVFS2() - defer wd.DecRef() + defer wd.DecRef(t) remainingTraversals := uint(linux.MaxSymlinkTraversals) loadArgs := loader.LoadArgs{ Opener: fsbridge.NewVFSLookup(mntns, root, wd), diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index 517394ba9..4856554fe 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -38,7 +38,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) err := file.OnClose(t) return 0, nil, slinux.HandleIOErrorVFS2(t, false /* partial */, err, syserror.EINTR, "close", file) @@ -52,7 +52,7 @@ func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) newFD, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{}) if err != nil { @@ -72,7 +72,7 @@ func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - file.DecRef() + file.DecRef(t) return uintptr(newfd), nil, nil } @@ -101,7 +101,7 @@ func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) err := t.NewFDAtVFS2(newfd, file, kernel.FDFlags{ CloseOnExec: flags&linux.O_CLOEXEC != 0, @@ -121,7 +121,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) switch cmd { case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: @@ -185,11 +185,11 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, err case linux.F_SETOWN_EX: var owner linux.FOwnerEx - n, err := t.CopyIn(args[2].Pointer(), &owner) + _, err := t.CopyIn(args[2].Pointer(), &owner) if err != nil { return 0, nil, err } - return uintptr(n), nil, setAsyncOwner(t, file, owner.Type, owner.PID) + return 0, nil, setAsyncOwner(t, file, owner.Type, owner.PID) case linux.F_GETPIPE_SZ: pipefile, ok := file.Impl().(*pipe.VFSPipeFD) if !ok { @@ -208,7 +208,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.F_SETLK, linux.F_SETLKW: return 0, nil, posixLock(t, args, file, cmd) default: - // TODO(gvisor.dev/issue/2920): Everything else is not yet supported. + // Everything else is not yet supported. return 0, nil, syserror.EINVAL } } @@ -332,7 +332,7 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // If the FD refers to a pipe or FIFO, return error. if _, isPipe := file.Impl().(*pipe.VFSPipeFD); isPipe { diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go index 6b14c2bef..01e0f9010 100644 --- a/pkg/sentry/syscalls/linux/vfs2/filesystem.go +++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go @@ -18,7 +18,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -57,7 +56,7 @@ func linkat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd i if err != nil { return err } - defer oldtpop.Release() + defer oldtpop.Release(t) newpath, err := copyInPath(t, newpathAddr) if err != nil { @@ -67,7 +66,7 @@ func linkat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd i if err != nil { return err } - defer newtpop.Release() + defer newtpop.Release(t) return t.Kernel().VFS().LinkAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop) } @@ -96,7 +95,7 @@ func mkdirat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint) error { if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &tpop.pop, &vfs.MkdirOptions{ Mode: linux.FileMode(mode & (0777 | linux.S_ISVTX) &^ t.FSContext().Umask()), }) @@ -128,7 +127,7 @@ func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode linux.FileMode if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) // "Zero file type is equivalent to type S_IFREG." - mknod(2) if mode.FileType() == 0 { @@ -175,7 +174,7 @@ func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mo if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &vfs.OpenOptions{ Flags: flags | linux.O_LARGEFILE, @@ -184,7 +183,7 @@ func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mo if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ CloseOnExec: flags&linux.O_CLOEXEC != 0, @@ -228,7 +227,7 @@ func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd if err != nil { return err } - defer oldtpop.Release() + defer oldtpop.Release(t) newpath, err := copyInPath(t, newpathAddr) if err != nil { @@ -238,62 +237,13 @@ func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd if err != nil { return err } - defer newtpop.Release() + defer newtpop.Release(t) return t.Kernel().VFS().RenameAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop, &vfs.RenameOptions{ Flags: flags, }) } -// Fallocate implements linux system call fallocate(2). -func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := args[0].Int() - mode := args[1].Uint64() - offset := args[2].Int64() - length := args[3].Int64() - - file := t.GetFileVFS2(fd) - - if file == nil { - return 0, nil, syserror.EBADF - } - defer file.DecRef() - - if !file.IsWritable() { - return 0, nil, syserror.EBADF - } - - if mode != 0 { - return 0, nil, syserror.ENOTSUP - } - - if offset < 0 || length <= 0 { - return 0, nil, syserror.EINVAL - } - - size := offset + length - - if size < 0 { - return 0, nil, syserror.EFBIG - } - - limit := limits.FromContext(t).Get(limits.FileSize).Cur - - if uint64(size) >= limit { - t.SendSignal(&arch.SignalInfo{ - Signo: int32(linux.SIGXFSZ), - Code: arch.SignalInfoUser, - }) - return 0, nil, syserror.EFBIG - } - - return 0, nil, file.Impl().Allocate(t, mode, uint64(offset), uint64(length)) - - // File length modified, generate notification. - // TODO(gvisor.dev/issue/1479): Reenable when Inotify is ported. - // file.Dirent.InotifyEvent(linux.IN_MODIFY, 0) -} - // Rmdir implements Linux syscall rmdir(2). func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() @@ -309,7 +259,7 @@ func rmdirat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error { if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) return t.Kernel().VFS().RmdirAt(t, t.Credentials(), &tpop.pop) } @@ -328,7 +278,7 @@ func unlinkat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error { if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) return t.Kernel().VFS().UnlinkAt(t, t.Credentials(), &tpop.pop) } @@ -379,6 +329,6 @@ func symlinkat(t *kernel.Task, targetAddr usermem.Addr, newdirfd int32, linkpath if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) return t.Kernel().VFS().SymlinkAt(t, t.Credentials(), &tpop.pop, target) } diff --git a/pkg/sentry/syscalls/linux/vfs2/fscontext.go b/pkg/sentry/syscalls/linux/vfs2/fscontext.go index 317409a18..a7d4d2a36 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fscontext.go +++ b/pkg/sentry/syscalls/linux/vfs2/fscontext.go @@ -31,8 +31,8 @@ func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal root := t.FSContext().RootDirectoryVFS2() wd := t.FSContext().WorkingDirectoryVFS2() s, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd) - root.DecRef() - wd.DecRef() + root.DecRef(t) + wd.DecRef(t) if err != nil { return 0, nil, err } @@ -67,7 +67,7 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ CheckSearchable: true, @@ -75,8 +75,8 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if err != nil { return 0, nil, err } - t.FSContext().SetWorkingDirectoryVFS2(vd) - vd.DecRef() + t.FSContext().SetWorkingDirectoryVFS2(t, vd) + vd.DecRef(t) return 0, nil, nil } @@ -88,7 +88,7 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ CheckSearchable: true, @@ -96,8 +96,8 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - t.FSContext().SetWorkingDirectoryVFS2(vd) - vd.DecRef() + t.FSContext().SetWorkingDirectoryVFS2(t, vd) + vd.DecRef(t) return 0, nil, nil } @@ -117,7 +117,7 @@ func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ CheckSearchable: true, @@ -125,7 +125,7 @@ func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - t.FSContext().SetRootDirectoryVFS2(vd) - vd.DecRef() + t.FSContext().SetRootDirectoryVFS2(t, vd) + vd.DecRef(t) return 0, nil, nil } diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go index c7c7bf7ce..5517595b5 100644 --- a/pkg/sentry/syscalls/linux/vfs2/getdents.go +++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go @@ -44,7 +44,7 @@ func getdents(t *kernel.Task, args arch.SyscallArguments, isGetdents64 bool) (ui if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) cb := getGetdentsCallback(t, addr, size, isGetdents64) err := file.IterDirents(t, cb) diff --git a/pkg/sentry/syscalls/linux/vfs2/inotify.go b/pkg/sentry/syscalls/linux/vfs2/inotify.go index 5d98134a5..11753d8e5 100644 --- a/pkg/sentry/syscalls/linux/vfs2/inotify.go +++ b/pkg/sentry/syscalls/linux/vfs2/inotify.go @@ -35,7 +35,7 @@ func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. if err != nil { return 0, nil, err } - defer ino.DecRef() + defer ino.DecRef(t) fd, err := t.NewFDFromVFS2(0, ino, kernel.FDFlags{ CloseOnExec: flags&linux.IN_CLOEXEC != 0, @@ -66,7 +66,7 @@ func fdToInotify(t *kernel.Task, fd int32) (*vfs.Inotify, *vfs.FileDescription, ino, ok := f.Impl().(*vfs.Inotify) if !ok { // Not an inotify fd. - f.DecRef() + f.DecRef(t) return nil, nil, syserror.EINVAL } @@ -96,7 +96,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern if err != nil { return 0, nil, err } - defer f.DecRef() + defer f.DecRef(t) path, err := copyInPath(t, addr) if err != nil { @@ -109,12 +109,12 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) d, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{}) if err != nil { return 0, nil, err } - defer d.DecRef() + defer d.DecRef(t) fd, err = ino.AddWatch(d.Dentry(), mask) if err != nil { @@ -132,6 +132,6 @@ func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if err != nil { return 0, nil, err } - defer f.DecRef() - return 0, nil, ino.RmWatch(wd) + defer f.DecRef(t) + return 0, nil, ino.RmWatch(t, wd) } diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go index fd6ab94b2..38778a388 100644 --- a/pkg/sentry/syscalls/linux/vfs2/ioctl.go +++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go @@ -29,7 +29,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Handle ioctls that apply to all FDs. switch args[1].Int() { diff --git a/pkg/sentry/syscalls/linux/vfs2/lock.go b/pkg/sentry/syscalls/linux/vfs2/lock.go index bf19028c4..b910b5a74 100644 --- a/pkg/sentry/syscalls/linux/vfs2/lock.go +++ b/pkg/sentry/syscalls/linux/vfs2/lock.go @@ -32,7 +32,7 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // flock(2): EBADF fd is not an open file descriptor. return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) nonblocking := operation&linux.LOCK_NB != 0 operation &^= linux.LOCK_NB diff --git a/pkg/sentry/syscalls/linux/vfs2/memfd.go b/pkg/sentry/syscalls/linux/vfs2/memfd.go index bbe248d17..c4c0f9e0a 100644 --- a/pkg/sentry/syscalls/linux/vfs2/memfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/memfd.go @@ -47,10 +47,11 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } shmMount := t.Kernel().ShmMount() - file, err := tmpfs.NewMemfd(shmMount, t.Credentials(), allowSeals, memfdPrefix+name) + file, err := tmpfs.NewMemfd(t, t.Credentials(), shmMount, allowSeals, memfdPrefix+name) if err != nil { return 0, nil, err } + defer file.DecRef(t) fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ CloseOnExec: cloExec, diff --git a/pkg/sentry/syscalls/linux/vfs2/mmap.go b/pkg/sentry/syscalls/linux/vfs2/mmap.go index 60a43f0a0..dc05c2994 100644 --- a/pkg/sentry/syscalls/linux/vfs2/mmap.go +++ b/pkg/sentry/syscalls/linux/vfs2/mmap.go @@ -61,7 +61,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC } defer func() { if opts.MappingIdentity != nil { - opts.MappingIdentity.DecRef() + opts.MappingIdentity.DecRef(t) } }() @@ -71,7 +71,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // mmap unconditionally requires that the FD is readable. if !file.IsReadable() { diff --git a/pkg/sentry/syscalls/linux/vfs2/mount.go b/pkg/sentry/syscalls/linux/vfs2/mount.go index ea337de7c..4bd5c7ca2 100644 --- a/pkg/sentry/syscalls/linux/vfs2/mount.go +++ b/pkg/sentry/syscalls/linux/vfs2/mount.go @@ -108,7 +108,7 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if err != nil { return 0, nil, err } - defer target.Release() + defer target.Release(t) return 0, nil, t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts) } @@ -140,7 +140,7 @@ func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) opts := vfs.UmountOptions{ Flags: uint32(flags), diff --git a/pkg/sentry/syscalls/linux/vfs2/path.go b/pkg/sentry/syscalls/linux/vfs2/path.go index 97da6c647..90a511d9a 100644 --- a/pkg/sentry/syscalls/linux/vfs2/path.go +++ b/pkg/sentry/syscalls/linux/vfs2/path.go @@ -42,7 +42,7 @@ func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldA haveStartRef := false if !path.Absolute { if !path.HasComponents() && !bool(shouldAllowEmptyPath) { - root.DecRef() + root.DecRef(t) return taskPathOperation{}, syserror.ENOENT } if dirfd == linux.AT_FDCWD { @@ -51,13 +51,13 @@ func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldA } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { - root.DecRef() + root.DecRef(t) return taskPathOperation{}, syserror.EBADF } start = dirfile.VirtualDentry() start.IncRef() haveStartRef = true - dirfile.DecRef() + dirfile.DecRef(t) } } return taskPathOperation{ @@ -71,10 +71,10 @@ func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldA }, nil } -func (tpop *taskPathOperation) Release() { - tpop.pop.Root.DecRef() +func (tpop *taskPathOperation) Release(t *kernel.Task) { + tpop.pop.Root.DecRef(t) if tpop.haveStartRef { - tpop.pop.Start.DecRef() + tpop.pop.Start.DecRef(t) tpop.haveStartRef = false } } diff --git a/pkg/sentry/syscalls/linux/vfs2/pipe.go b/pkg/sentry/syscalls/linux/vfs2/pipe.go index 4a01e4209..9b4848d9e 100644 --- a/pkg/sentry/syscalls/linux/vfs2/pipe.go +++ b/pkg/sentry/syscalls/linux/vfs2/pipe.go @@ -42,8 +42,8 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error { return syserror.EINVAL } r, w := pipefs.NewConnectedPipeFDs(t, t.Kernel().PipeMount(), uint32(flags&linux.O_NONBLOCK)) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(t) + defer w.DecRef(t) fds, err := t.NewFDsVFS2(0, []*vfs.FileDescription{r, w}, kernel.FDFlags{ CloseOnExec: flags&linux.O_CLOEXEC != 0, @@ -54,7 +54,7 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error { if _, err := t.CopyOut(addr, fds); err != nil { for _, fd := range fds { if _, file := t.FDTable().Remove(fd); file != nil { - file.DecRef() + file.DecRef(t) } } return err diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go index ff1b25d7b..7b9d5e18a 100644 --- a/pkg/sentry/syscalls/linux/vfs2/poll.go +++ b/pkg/sentry/syscalls/linux/vfs2/poll.go @@ -73,7 +73,7 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan } if ch == nil { - defer file.DecRef() + defer file.DecRef(t) } else { state.file = file state.waiter, _ = waiter.NewChannelEntry(ch) @@ -85,11 +85,11 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan } // releaseState releases all the pollState in "state". -func releaseState(state []pollState) { +func releaseState(t *kernel.Task, state []pollState) { for i := range state { if state[i].file != nil { state[i].file.EventUnregister(&state[i].waiter) - state[i].file.DecRef() + state[i].file.DecRef(t) } } } @@ -110,7 +110,7 @@ func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time. // result, we stop registering for events but still go through all files // to get their ready masks. state := make([]pollState, len(pfd)) - defer releaseState(state) + defer releaseState(t, state) n := uintptr(0) for i := range pfd { initReadiness(t, &pfd[i], &state[i], ch) @@ -269,7 +269,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add if file == nil { return 0, syserror.EBADF } - file.DecRef() + file.DecRef(t) var mask int16 if (rV & m) != 0 { diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go index cd25597a7..a905dae0a 100644 --- a/pkg/sentry/syscalls/linux/vfs2/read_write.go +++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go @@ -44,7 +44,7 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the size is legitimate. si := int(size) @@ -75,7 +75,7 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Get the destination of the read. dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ @@ -94,7 +94,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt n, err := file.Read(t, dst, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -102,7 +102,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -135,7 +135,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return total, err } @@ -151,7 +151,7 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { @@ -188,7 +188,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { @@ -226,7 +226,7 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { @@ -258,7 +258,7 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of n, err := file.PRead(t, dst, offset, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -266,7 +266,7 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -299,7 +299,7 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return total, err } @@ -314,7 +314,7 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the size is legitimate. si := int(size) @@ -345,7 +345,7 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Get the source of the write. src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ @@ -364,7 +364,7 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op n, err := file.Write(t, src, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) } return n, err } @@ -372,7 +372,7 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) } return n, err } @@ -405,7 +405,7 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) } return total, err } @@ -421,7 +421,7 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { @@ -458,7 +458,7 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { @@ -496,7 +496,7 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { @@ -528,7 +528,7 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o n, err := file.PWrite(t, src, offset, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) } return n, err } @@ -536,7 +536,7 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -569,7 +569,7 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return total, err } @@ -601,7 +601,7 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) newoff, err := file.Seek(t, offset, whence) return uintptr(newoff), nil, err @@ -617,7 +617,7 @@ func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the file is readable. if !file.IsReadable() { diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go index 6daedd173..5e6eb13ba 100644 --- a/pkg/sentry/syscalls/linux/vfs2/setstat.go +++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -65,7 +66,7 @@ func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, file.SetStat(t, vfs.SetStatOptions{ Stat: linux.Statx{ @@ -150,7 +151,7 @@ func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) var opts vfs.SetStatOptions if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil { @@ -196,7 +197,7 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) if !file.IsWritable() { return 0, nil, syserror.EINVAL @@ -211,6 +212,56 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, handleSetSizeError(t, err) } +// Fallocate implements linux system call fallocate(2). +func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + mode := args[1].Uint64() + offset := args[2].Int64() + length := args[3].Int64() + + file := t.GetFileVFS2(fd) + + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef(t) + + if !file.IsWritable() { + return 0, nil, syserror.EBADF + } + + if mode != 0 { + return 0, nil, syserror.ENOTSUP + } + + if offset < 0 || length <= 0 { + return 0, nil, syserror.EINVAL + } + + size := offset + length + + if size < 0 { + return 0, nil, syserror.EFBIG + } + + limit := limits.FromContext(t).Get(limits.FileSize).Cur + + if uint64(size) >= limit { + t.SendSignal(&arch.SignalInfo{ + Signo: int32(linux.SIGXFSZ), + Code: arch.SignalInfoUser, + }) + return 0, nil, syserror.EFBIG + } + + if err := file.Allocate(t, mode, uint64(offset), uint64(length)); err != nil { + return 0, nil, err + } + + file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) + return 0, nil, nil +} + // Utime implements Linux syscall utime(2). func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() @@ -387,7 +438,7 @@ func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, op func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error { root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) start := root if !path.Absolute { if !path.HasComponents() && !bool(shouldAllowEmptyPath) { @@ -395,7 +446,7 @@ func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPa } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() - defer start.DecRef() + defer start.DecRef(t) } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { @@ -406,13 +457,13 @@ func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPa // VirtualFilesystem.SetStatAt(), since the former may be able // to use opened file state to expedite the SetStat. err := dirfile.SetStat(t, *opts) - dirfile.DecRef() + dirfile.DecRef(t) return err } start = dirfile.VirtualDentry() start.IncRef() - defer start.DecRef() - dirfile.DecRef() + defer start.DecRef(t) + dirfile.DecRef(t) } } return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{ diff --git a/pkg/sentry/syscalls/linux/vfs2/signal.go b/pkg/sentry/syscalls/linux/vfs2/signal.go index 623992f6f..b89f34cdb 100644 --- a/pkg/sentry/syscalls/linux/vfs2/signal.go +++ b/pkg/sentry/syscalls/linux/vfs2/signal.go @@ -45,7 +45,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Is this a signalfd? if sfd, ok := file.Impl().(*signalfd.SignalFileDescription); ok { @@ -68,7 +68,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) // Create a new descriptor. fd, err = t.NewFDFromVFS2(0, file, kernel.FDFlags{ diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go index 8096a8f9c..4a68c64f3 100644 --- a/pkg/sentry/syscalls/linux/vfs2/socket.go +++ b/pkg/sentry/syscalls/linux/vfs2/socket.go @@ -196,7 +196,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if e != nil { return 0, nil, e.ToError() } - defer s.DecRef() + defer s.DecRef(t) if err := s.SetStatusFlags(t, t.Credentials(), uint32(stype&linux.SOCK_NONBLOCK)); err != nil { return 0, nil, err @@ -230,8 +230,8 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return 0, nil, e.ToError() } // Adding to the FD table will cause an extra reference to be acquired. - defer s1.DecRef() - defer s2.DecRef() + defer s1.DecRef(t) + defer s2.DecRef(t) nonblocking := uint32(stype & linux.SOCK_NONBLOCK) if err := s1.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil { @@ -253,7 +253,7 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if _, err := t.CopyOut(addr, fds); err != nil { for _, fd := range fds { if _, file := t.FDTable().Remove(fd); file != nil { - file.DecRef() + file.DecRef(t) } } return 0, nil, err @@ -273,7 +273,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -304,7 +304,7 @@ func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, f if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -363,7 +363,7 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -390,7 +390,7 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -419,7 +419,7 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -450,7 +450,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -532,7 +532,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -570,7 +570,7 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -598,7 +598,7 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -631,7 +631,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -684,7 +684,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -778,7 +778,7 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla } if !cms.Unix.Empty() { mflags |= linux.MSG_CTRUNC - cms.Release() + cms.Release(t) } if int(msg.Flags) != mflags { @@ -798,7 +798,7 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla if e != nil { return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) } - defer cms.Release() + defer cms.Release(t) controlData := make([]byte, 0, msg.ControlLen) controlData = control.PackControlMessages(t, cms, controlData) @@ -854,7 +854,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -883,7 +883,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag } n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) - cm.Release() + cm.Release(t) if e != nil { return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) } @@ -927,7 +927,7 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -965,7 +965,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -1069,7 +1069,7 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file) if err != nil { - controlMessages.Release() + controlMessages.Release(t) } return uintptr(n), err } @@ -1087,7 +1087,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go index 63ab11f8c..75bfa2c79 100644 --- a/pkg/sentry/syscalls/linux/vfs2/splice.go +++ b/pkg/sentry/syscalls/linux/vfs2/splice.go @@ -53,12 +53,12 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if inFile == nil { return 0, nil, syserror.EBADF } - defer inFile.DecRef() + defer inFile.DecRef(t) outFile := t.GetFileVFS2(outFD) if outFile == nil { return 0, nil, syserror.EBADF } - defer outFile.DecRef() + defer outFile.DecRef(t) // Check that both files support the required directionality. if !inFile.IsReadable() || !outFile.IsWritable() { @@ -175,7 +175,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // On Linux, inotify behavior is not very consistent with splice(2). We try // our best to emulate Linux for very basic calls to splice, where for some // reason, events are generated for output files, but not input files. - outFile.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) return uintptr(n), nil, nil } @@ -203,12 +203,12 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if inFile == nil { return 0, nil, syserror.EBADF } - defer inFile.DecRef() + defer inFile.DecRef(t) outFile := t.GetFileVFS2(outFD) if outFile == nil { return 0, nil, syserror.EBADF } - defer outFile.DecRef() + defer outFile.DecRef(t) // Check that both files support the required directionality. if !inFile.IsReadable() || !outFile.IsWritable() { @@ -251,7 +251,7 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if n == 0 { return 0, nil, err } - outFile.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) return uintptr(n), nil, nil } @@ -266,7 +266,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if inFile == nil { return 0, nil, syserror.EBADF } - defer inFile.DecRef() + defer inFile.DecRef(t) if !inFile.IsReadable() { return 0, nil, syserror.EBADF } @@ -275,7 +275,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if outFile == nil { return 0, nil, syserror.EBADF } - defer outFile.DecRef() + defer outFile.DecRef(t) if !outFile.IsWritable() { return 0, nil, syserror.EBADF } @@ -347,6 +347,11 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } else { spliceN, err = inFile.Read(t, outPipeFD.IOSequence(count), vfs.ReadOptions{}) } + if spliceN == 0 && err == io.EOF { + // We reached the end of the file. Eat the error and exit the loop. + err = nil + break + } n += spliceN if err == syserror.ErrWouldBlock && !nonBlock { err = dw.waitForBoth(t) @@ -367,8 +372,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc readN, err = inFile.Read(t, usermem.BytesIOSequence(buf), vfs.ReadOptions{}) } if readN == 0 && err == io.EOF { - // We reached the end of the file. Eat the - // error and exit the loop. + // We reached the end of the file. Eat the error and exit the loop. err = nil break } @@ -419,8 +423,8 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, err } - inFile.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) - outFile.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + inFile.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) + outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) return uintptr(n), nil, nil } diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go index bb1d5cac4..0f5d5189c 100644 --- a/pkg/sentry/syscalls/linux/vfs2/stat.go +++ b/pkg/sentry/syscalls/linux/vfs2/stat.go @@ -65,7 +65,7 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags } root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) start := root if !path.Absolute { if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { @@ -73,7 +73,7 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() - defer start.DecRef() + defer start.DecRef(t) } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { @@ -85,7 +85,7 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags // former may be able to use opened file state to expedite the // Stat. statx, err := dirfile.Stat(t, opts) - dirfile.DecRef() + dirfile.DecRef(t) if err != nil { return err } @@ -96,8 +96,8 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags } start = dirfile.VirtualDentry() start.IncRef() - defer start.DecRef() - dirfile.DecRef() + defer start.DecRef(t) + dirfile.DecRef(t) } } @@ -132,7 +132,7 @@ func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) statx, err := file.Stat(t, vfs.StatOptions{ Mask: linux.STATX_BASIC_STATS, @@ -177,7 +177,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) start := root if !path.Absolute { if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { @@ -185,7 +185,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() - defer start.DecRef() + defer start.DecRef(t) } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { @@ -197,7 +197,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // former may be able to use opened file state to expedite the // Stat. statx, err := dirfile.Stat(t, opts) - dirfile.DecRef() + dirfile.DecRef(t) if err != nil { return 0, nil, err } @@ -207,8 +207,8 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } start = dirfile.VirtualDentry() start.IncRef() - defer start.DecRef() - dirfile.DecRef() + defer start.DecRef(t) + dirfile.DecRef(t) } } @@ -282,7 +282,7 @@ func accessAt(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) err if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) // access(2) and faccessat(2) check permissions using real // UID/GID, not effective UID/GID. @@ -328,7 +328,7 @@ func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr usermem.Addr, siz if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) target, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop) if err != nil { @@ -358,7 +358,7 @@ func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop) if err != nil { @@ -377,7 +377,7 @@ func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop) if err != nil { diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go index 0d0ebf46a..a6491ac37 100644 --- a/pkg/sentry/syscalls/linux/vfs2/sync.go +++ b/pkg/sentry/syscalls/linux/vfs2/sync.go @@ -34,7 +34,7 @@ func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, file.SyncFS(t) } @@ -47,7 +47,7 @@ func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, file.Sync(t) } @@ -77,7 +77,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // TODO(gvisor.dev/issue/1897): Currently, the only file syncing we support // is a full-file sync, i.e. fsync(2). As a result, there are severe diff --git a/pkg/sentry/syscalls/linux/vfs2/timerfd.go b/pkg/sentry/syscalls/linux/vfs2/timerfd.go index 5ac79bc09..7a26890ef 100644 --- a/pkg/sentry/syscalls/linux/vfs2/timerfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/timerfd.go @@ -50,11 +50,11 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel return 0, nil, syserror.EINVAL } vfsObj := t.Kernel().VFS() - file, err := timerfd.New(vfsObj, clock, fileFlags) + file, err := timerfd.New(t, vfsObj, clock, fileFlags) if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ CloseOnExec: flags&linux.TFD_CLOEXEC != 0, }) @@ -79,7 +79,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) tfd, ok := file.Impl().(*timerfd.TimerFileDescription) if !ok { @@ -113,7 +113,7 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) tfd, ok := file.Impl().(*timerfd.TimerFileDescription) if !ok { diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go index af455d5c1..ef99246ed 100644 --- a/pkg/sentry/syscalls/linux/vfs2/xattr.go +++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go @@ -49,7 +49,7 @@ func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSyml if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop, uint64(size)) if err != nil { @@ -72,7 +72,7 @@ func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) names, err := file.Listxattr(t, uint64(size)) if err != nil { @@ -109,7 +109,7 @@ func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) name, err := copyInXattrName(t, nameAddr) if err != nil { @@ -141,7 +141,7 @@ func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) name, err := copyInXattrName(t, nameAddr) if err != nil { @@ -188,7 +188,7 @@ func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) name, err := copyInXattrName(t, nameAddr) if err != nil { @@ -222,7 +222,7 @@ func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) name, err := copyInXattrName(t, nameAddr) if err != nil { @@ -262,7 +262,7 @@ func removexattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSy if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) name, err := copyInXattrName(t, nameAddr) if err != nil { @@ -281,7 +281,7 @@ func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) name, err := copyInXattrName(t, nameAddr) if err != nil { diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index 641e3e502..5a0e3e6b5 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -82,7 +82,7 @@ type anonDentry struct { } // Release implements FilesystemImpl.Release. -func (fs *anonFilesystem) Release() { +func (fs *anonFilesystem) Release(ctx context.Context) { } // Sync implements FilesystemImpl.Sync. @@ -294,7 +294,7 @@ func (d *anonDentry) TryIncRef() bool { } // DecRef implements DentryImpl.DecRef. -func (d *anonDentry) DecRef() { +func (d *anonDentry) DecRef(ctx context.Context) { // no-op } @@ -303,7 +303,7 @@ func (d *anonDentry) DecRef() { // Although Linux technically supports inotify on pseudo filesystems (inotify // is implemented at the vfs layer), it is not particularly useful. It is left // unimplemented until someone actually needs it. -func (d *anonDentry) InotifyWithParent(events, cookie uint32, et EventType) {} +func (d *anonDentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType) {} // Watches implements DentryImpl.Watches. func (d *anonDentry) Watches() *Watches { @@ -311,4 +311,4 @@ func (d *anonDentry) Watches() *Watches { } // OnZeroWatches implements Dentry.OnZeroWatches. -func (d *anonDentry) OnZeroWatches() {} +func (d *anonDentry) OnZeroWatches(context.Context) {} diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index cea3e6955..bc7ea93ea 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -17,6 +17,7 @@ package vfs import ( "sync/atomic" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -102,7 +103,7 @@ type DentryImpl interface { TryIncRef() bool // DecRef decrements the Dentry's reference count. - DecRef() + DecRef(ctx context.Context) // InotifyWithParent notifies all watches on the targets represented by this // dentry and its parent. The parent's watches are notified first, followed @@ -113,7 +114,7 @@ type DentryImpl interface { // // Note that the events may not actually propagate up to the user, depending // on the event masks. - InotifyWithParent(events, cookie uint32, et EventType) + InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType) // Watches returns the set of inotify watches for the file corresponding to // the Dentry. Dentries that are hard links to the same underlying file @@ -135,7 +136,7 @@ type DentryImpl interface { // The caller does not need to hold a reference on the dentry. OnZeroWatches // may acquire inotify locks, so to prevent deadlock, no inotify locks should // be held by the caller. - OnZeroWatches() + OnZeroWatches(ctx context.Context) } // IncRef increments d's reference count. @@ -150,8 +151,8 @@ func (d *Dentry) TryIncRef() bool { } // DecRef decrements d's reference count. -func (d *Dentry) DecRef() { - d.impl.DecRef() +func (d *Dentry) DecRef(ctx context.Context) { + d.impl.DecRef(ctx) } // IsDead returns true if d has been deleted or invalidated by its owning @@ -168,8 +169,8 @@ func (d *Dentry) isMounted() bool { // InotifyWithParent notifies all watches on the targets represented by d and // its parent of events. -func (d *Dentry) InotifyWithParent(events, cookie uint32, et EventType) { - d.impl.InotifyWithParent(events, cookie, et) +func (d *Dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType) { + d.impl.InotifyWithParent(ctx, events, cookie, et) } // Watches returns the set of inotify watches associated with d. @@ -182,8 +183,8 @@ func (d *Dentry) Watches() *Watches { // OnZeroWatches performs cleanup tasks whenever the number of watches on a // dentry drops to zero. -func (d *Dentry) OnZeroWatches() { - d.impl.OnZeroWatches() +func (d *Dentry) OnZeroWatches(ctx context.Context) { + d.impl.OnZeroWatches(ctx) } // The following functions are exported so that filesystem implementations can @@ -214,11 +215,11 @@ func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) { // CommitDeleteDentry must be called after PrepareDeleteDentry if the deletion // succeeds. -func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) { +func (vfs *VirtualFilesystem) CommitDeleteDentry(ctx context.Context, d *Dentry) { d.dead = true d.mu.Unlock() if d.isMounted() { - vfs.forgetDeadMountpoint(d) + vfs.forgetDeadMountpoint(ctx, d) } } @@ -226,12 +227,12 @@ func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) { // did for reasons outside of VFS' control (e.g. d represents the local state // of a file on a remote filesystem on which the file has already been // deleted). -func (vfs *VirtualFilesystem) InvalidateDentry(d *Dentry) { +func (vfs *VirtualFilesystem) InvalidateDentry(ctx context.Context, d *Dentry) { d.mu.Lock() d.dead = true d.mu.Unlock() if d.isMounted() { - vfs.forgetDeadMountpoint(d) + vfs.forgetDeadMountpoint(ctx, d) } } @@ -278,13 +279,13 @@ func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) { // that was replaced by from. // // Preconditions: PrepareRenameDentry was previously called on from and to. -func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, to *Dentry) { +func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(ctx context.Context, from, to *Dentry) { from.mu.Unlock() if to != nil { to.dead = true to.mu.Unlock() if to.isMounted() { - vfs.forgetDeadMountpoint(to) + vfs.forgetDeadMountpoint(ctx, to) } } } @@ -303,7 +304,7 @@ func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) { // // forgetDeadMountpoint is analogous to Linux's // fs/namespace.c:__detach_mounts(). -func (vfs *VirtualFilesystem) forgetDeadMountpoint(d *Dentry) { +func (vfs *VirtualFilesystem) forgetDeadMountpoint(ctx context.Context, d *Dentry) { var ( vdsToDecRef []VirtualDentry mountsToDecRef []*Mount @@ -316,9 +317,9 @@ func (vfs *VirtualFilesystem) forgetDeadMountpoint(d *Dentry) { vfs.mounts.seq.EndWrite() vfs.mountMu.Unlock() for _, vd := range vdsToDecRef { - vd.DecRef() + vd.DecRef(ctx) } for _, mnt := range mountsToDecRef { - mnt.DecRef() + mnt.DecRef(ctx) } } diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go index 5b009b928..1b5af9f73 100644 --- a/pkg/sentry/vfs/epoll.go +++ b/pkg/sentry/vfs/epoll.go @@ -93,9 +93,9 @@ type epollInterest struct { // NewEpollInstanceFD returns a FileDescription representing a new epoll // instance. A reference is taken on the returned FileDescription. -func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) { +func (vfs *VirtualFilesystem) NewEpollInstanceFD(ctx context.Context) (*FileDescription, error) { vd := vfs.NewAnonVirtualDentry("[eventpoll]") - defer vd.DecRef() + defer vd.DecRef(ctx) ep := &EpollInstance{ interest: make(map[epollInterestKey]*epollInterest), } @@ -110,7 +110,7 @@ func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) { } // Release implements FileDescriptionImpl.Release. -func (ep *EpollInstance) Release() { +func (ep *EpollInstance) Release(ctx context.Context) { // Unregister all polled fds. ep.interestMu.Lock() defer ep.interestMu.Unlock() diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 0c42574db..d3c1197e3 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -171,7 +171,7 @@ func (fd *FileDescription) TryIncRef() bool { } // DecRef decrements fd's reference count. -func (fd *FileDescription) DecRef() { +func (fd *FileDescription) DecRef(ctx context.Context) { if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 { // Unregister fd from all epoll instances. fd.epollMu.Lock() @@ -196,11 +196,11 @@ func (fd *FileDescription) DecRef() { } // Release implementation resources. - fd.impl.Release() + fd.impl.Release(ctx) if fd.writable { fd.vd.mount.EndWrite() } - fd.vd.DecRef() + fd.vd.DecRef(ctx) fd.flagsMu.Lock() // TODO(gvisor.dev/issue/1663): We may need to unregister during save, as we do in VFS1. if fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil { @@ -335,7 +335,7 @@ func (fd *FileDescription) Impl() FileDescriptionImpl { type FileDescriptionImpl interface { // Release is called when the associated FileDescription reaches zero // references. - Release() + Release(ctx context.Context) // OnClose is called when a file descriptor representing the // FileDescription is closed. Note that returning a non-nil error does not @@ -354,8 +354,10 @@ type FileDescriptionImpl interface { // represented by the FileDescription. StatFS(ctx context.Context) (linux.Statfs, error) - // Allocate grows file represented by FileDescription to offset + length bytes. + // Allocate grows the file to offset + length bytes. // Only mode == 0 is supported currently. + // + // Preconditions: The FileDescription was opened for writing. Allocate(ctx context.Context, mode, offset, length uint64) error // waiter.Waitable methods may be used to poll for I/O events. @@ -526,7 +528,7 @@ func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.St Start: fd.vd, }) stat, err := fd.vd.mount.fs.impl.StatAt(ctx, rp, opts) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return stat, err } return fd.impl.Stat(ctx, opts) @@ -541,7 +543,7 @@ func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) err Start: fd.vd, }) err := fd.vd.mount.fs.impl.SetStatAt(ctx, rp, opts) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return err } return fd.impl.SetStat(ctx, opts) @@ -557,12 +559,20 @@ func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { Start: fd.vd, }) statfs, err := fd.vd.mount.fs.impl.StatFSAt(ctx, rp) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return statfs, err } return fd.impl.StatFS(ctx) } +// Allocate grows file represented by FileDescription to offset + length bytes. +func (fd *FileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error { + if !fd.IsWritable() { + return syserror.EBADF + } + return fd.impl.Allocate(ctx, mode, offset, length) +} + // Readiness implements waiter.Waitable.Readiness. // // It returns fd's I/O readiness. @@ -669,7 +679,7 @@ func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string Start: fd.vd, }) names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp, size) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return names, err } names, err := fd.impl.Listxattr(ctx, size) @@ -698,7 +708,7 @@ func (fd *FileDescription) Getxattr(ctx context.Context, opts *GetxattrOptions) Start: fd.vd, }) val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, *opts) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return val, err } return fd.impl.Getxattr(ctx, *opts) @@ -714,7 +724,7 @@ func (fd *FileDescription) Setxattr(ctx context.Context, opts *SetxattrOptions) Start: fd.vd, }) err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, *opts) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return err } return fd.impl.Setxattr(ctx, *opts) @@ -730,7 +740,7 @@ func (fd *FileDescription) Removexattr(ctx context.Context, name string) error { Start: fd.vd, }) err := fd.vd.mount.fs.impl.RemovexattrAt(ctx, rp, name) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return err } return fd.impl.Removexattr(ctx, name) @@ -747,7 +757,7 @@ func (fd *FileDescription) MappedName(ctx context.Context) string { vfsroot := RootFromContext(ctx) s, _ := fd.vd.mount.vfs.PathnameWithDeleted(ctx, vfsroot, fd.vd) if vfsroot.Ok() { - vfsroot.DecRef() + vfsroot.DecRef(ctx) } return s } diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index 3b7e1c273..1cd607c0a 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -80,9 +80,9 @@ type testFD struct { data DynamicBytesSource } -func newTestFD(vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesSource) *FileDescription { +func newTestFD(ctx context.Context, vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesSource) *FileDescription { vd := vfsObj.NewAnonVirtualDentry("genCountFD") - defer vd.DecRef() + defer vd.DecRef(ctx) var fd testFD fd.vfsfd.Init(&fd, statusFlags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{}) fd.DynamicBytesFileDescriptionImpl.SetDataSource(data) @@ -90,7 +90,7 @@ func newTestFD(vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesS } // Release implements FileDescriptionImpl.Release. -func (fd *testFD) Release() { +func (fd *testFD) Release(context.Context) { } // SetStatusFlags implements FileDescriptionImpl.SetStatusFlags. @@ -109,11 +109,11 @@ func TestGenCountFD(t *testing.T) { ctx := contexttest.Context(t) vfsObj := &VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } - fd := newTestFD(vfsObj, linux.O_RDWR, &genCount{}) - defer fd.DecRef() + fd := newTestFD(ctx, vfsObj, linux.O_RDWR, &genCount{}) + defer fd.DecRef(ctx) // The first read causes Generate to be called to fill the FD's buffer. buf := make([]byte, 2) @@ -167,11 +167,11 @@ func TestWritable(t *testing.T) { ctx := contexttest.Context(t) vfsObj := &VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } - fd := newTestFD(vfsObj, linux.O_RDWR, &storeData{data: "init"}) - defer fd.DecRef() + fd := newTestFD(ctx, vfsObj, linux.O_RDWR, &storeData{data: "init"}) + defer fd.DecRef(ctx) buf := make([]byte, 10) ioseq := usermem.BytesIOSequence(buf) diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index 6bb9ca180..df3758fd1 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -100,12 +100,12 @@ func (fs *Filesystem) TryIncRef() bool { } // DecRef decrements fs' reference count. -func (fs *Filesystem) DecRef() { +func (fs *Filesystem) DecRef(ctx context.Context) { if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 { fs.vfs.filesystemsMu.Lock() delete(fs.vfs.filesystems, fs) fs.vfs.filesystemsMu.Unlock() - fs.impl.Release() + fs.impl.Release(ctx) } else if refs < 0 { panic("Filesystem.decRef() called without holding a reference") } @@ -149,7 +149,7 @@ func (fs *Filesystem) DecRef() { type FilesystemImpl interface { // Release is called when the associated Filesystem reaches zero // references. - Release() + Release(ctx context.Context) // Sync "causes all pending modifications to filesystem metadata and cached // file data to be written to the underlying [filesystem]", as by syncfs(2). diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go index 167b731ac..aff220a61 100644 --- a/pkg/sentry/vfs/inotify.go +++ b/pkg/sentry/vfs/inotify.go @@ -100,7 +100,7 @@ func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) id := uniqueid.GlobalFromContext(ctx) vd := vfsObj.NewAnonVirtualDentry(fmt.Sprintf("[inotifyfd:%d]", id)) - defer vd.DecRef() + defer vd.DecRef(ctx) fd := &Inotify{ id: id, scratch: make([]byte, inotifyEventBaseSize), @@ -118,7 +118,7 @@ func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) // Release implements FileDescriptionImpl.Release. Release removes all // watches and frees all resources for an inotify instance. -func (i *Inotify) Release() { +func (i *Inotify) Release(ctx context.Context) { var ds []*Dentry // We need to hold i.mu to avoid a race with concurrent calls to @@ -144,7 +144,7 @@ func (i *Inotify) Release() { i.mu.Unlock() for _, d := range ds { - d.OnZeroWatches() + d.OnZeroWatches(ctx) } } @@ -350,7 +350,7 @@ func (i *Inotify) AddWatch(target *Dentry, mask uint32) (int32, error) { // RmWatch looks up an inotify watch for the given 'wd' and configures the // target to stop sending events to this inotify instance. -func (i *Inotify) RmWatch(wd int32) error { +func (i *Inotify) RmWatch(ctx context.Context, wd int32) error { i.mu.Lock() // Find the watch we were asked to removed. @@ -374,7 +374,7 @@ func (i *Inotify) RmWatch(wd int32) error { i.mu.Unlock() if remaining == 0 { - w.target.OnZeroWatches() + w.target.OnZeroWatches(ctx) } // Generate the event for the removal. @@ -462,7 +462,7 @@ func (w *Watches) Remove(id uint64) { // Notify queues a new event with watches in this set. Watches with // IN_EXCL_UNLINK are skipped if the event is coming from a child that has been // unlinked. -func (w *Watches) Notify(name string, events, cookie uint32, et EventType, unlinked bool) { +func (w *Watches) Notify(ctx context.Context, name string, events, cookie uint32, et EventType, unlinked bool) { var hasExpired bool w.mu.RLock() for _, watch := range w.ws { @@ -476,13 +476,13 @@ func (w *Watches) Notify(name string, events, cookie uint32, et EventType, unlin w.mu.RUnlock() if hasExpired { - w.cleanupExpiredWatches() + w.cleanupExpiredWatches(ctx) } } // This function is relatively expensive and should only be called where there // are expired watches. -func (w *Watches) cleanupExpiredWatches() { +func (w *Watches) cleanupExpiredWatches(ctx context.Context) { // Because of lock ordering, we cannot acquire Inotify.mu for each watch // owner while holding w.mu. As a result, store expired watches locally // before removing. @@ -495,15 +495,15 @@ func (w *Watches) cleanupExpiredWatches() { } w.mu.RUnlock() for _, watch := range toRemove { - watch.owner.RmWatch(watch.wd) + watch.owner.RmWatch(ctx, watch.wd) } } // HandleDeletion is called when the watch target is destroyed. Clear the // watch set, detach watches from the inotify instances they belong to, and // generate the appropriate events. -func (w *Watches) HandleDeletion() { - w.Notify("", linux.IN_DELETE_SELF, 0, InodeEvent, true /* unlinked */) +func (w *Watches) HandleDeletion(ctx context.Context) { + w.Notify(ctx, "", linux.IN_DELETE_SELF, 0, InodeEvent, true /* unlinked */) // As in Watches.Notify, we can't hold w.mu while acquiring Inotify.mu for // the owner of each watch being deleted. Instead, atomically store the @@ -744,12 +744,12 @@ func InotifyEventFromStatMask(mask uint32) uint32 { // InotifyRemoveChild sends the appriopriate notifications to the watch sets of // the child being removed and its parent. Note that unlike most pairs of // parent/child notifications, the child is notified first in this case. -func InotifyRemoveChild(self, parent *Watches, name string) { +func InotifyRemoveChild(ctx context.Context, self, parent *Watches, name string) { if self != nil { - self.Notify("", linux.IN_ATTRIB, 0, InodeEvent, true /* unlinked */) + self.Notify(ctx, "", linux.IN_ATTRIB, 0, InodeEvent, true /* unlinked */) } if parent != nil { - parent.Notify(name, linux.IN_DELETE, 0, InodeEvent, true /* unlinked */) + parent.Notify(ctx, name, linux.IN_DELETE, 0, InodeEvent, true /* unlinked */) } } @@ -762,13 +762,13 @@ func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches, } cookie := uniqueid.InotifyCookie(ctx) if oldParent != nil { - oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent, false /* unlinked */) + oldParent.Notify(ctx, oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent, false /* unlinked */) } if newParent != nil { - newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent, false /* unlinked */) + newParent.Notify(ctx, newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent, false /* unlinked */) } // Somewhat surprisingly, self move events do not have a cookie. if renamed != nil { - renamed.Notify("", linux.IN_MOVE_SELF, 0, InodeEvent, false /* unlinked */) + renamed.Notify(ctx, "", linux.IN_MOVE_SELF, 0, InodeEvent, false /* unlinked */) } } diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 32f901bd8..d1d29d0cd 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -200,8 +200,8 @@ func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth if err != nil { return nil, err } - defer root.DecRef() - defer fs.DecRef() + defer root.DecRef(ctx) + defer fs.DecRef(ctx) return vfs.NewDisconnectedMount(fs, root, opts) } @@ -221,7 +221,7 @@ func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Cr if vd.dentry.dead { vd.dentry.mu.Unlock() vfs.mountMu.Unlock() - vd.DecRef() + vd.DecRef(ctx) return syserror.ENOENT } // vd might have been mounted over between vfs.GetDentryAt() and @@ -243,7 +243,7 @@ func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Cr // This can't fail since we're holding vfs.mountMu. nextmnt.root.IncRef() vd.dentry.mu.Unlock() - vd.DecRef() + vd.DecRef(ctx) vd = VirtualDentry{ mount: nextmnt, dentry: nextmnt.root, @@ -268,7 +268,7 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia if err != nil { return err } - defer mnt.DecRef() + defer mnt.DecRef(ctx) if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil { return err } @@ -293,13 +293,13 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti if err != nil { return err } - defer vd.DecRef() + defer vd.DecRef(ctx) if vd.dentry != vd.mount.root { return syserror.EINVAL } vfs.mountMu.Lock() if mntns := MountNamespaceFromContext(ctx); mntns != nil { - defer mntns.DecRef() + defer mntns.DecRef(ctx) if mntns != vd.mount.ns { vfs.mountMu.Unlock() return syserror.EINVAL @@ -335,10 +335,10 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti vfs.mounts.seq.EndWrite() vfs.mountMu.Unlock() for _, vd := range vdsToDecRef { - vd.DecRef() + vd.DecRef(ctx) } for _, mnt := range mountsToDecRef { - mnt.DecRef() + mnt.DecRef(ctx) } return nil } @@ -479,7 +479,7 @@ func (mnt *Mount) IncRef() { } // DecRef decrements mnt's reference count. -func (mnt *Mount) DecRef() { +func (mnt *Mount) DecRef(ctx context.Context) { refs := atomic.AddInt64(&mnt.refs, -1) if refs&^math.MinInt64 == 0 { // mask out MSB var vd VirtualDentry @@ -490,10 +490,10 @@ func (mnt *Mount) DecRef() { mnt.vfs.mounts.seq.EndWrite() mnt.vfs.mountMu.Unlock() } - mnt.root.DecRef() - mnt.fs.DecRef() + mnt.root.DecRef(ctx) + mnt.fs.DecRef(ctx) if vd.Ok() { - vd.DecRef() + vd.DecRef(ctx) } } } @@ -506,7 +506,7 @@ func (mntns *MountNamespace) IncRef() { } // DecRef decrements mntns' reference count. -func (mntns *MountNamespace) DecRef() { +func (mntns *MountNamespace) DecRef(ctx context.Context) { vfs := mntns.root.fs.VirtualFilesystem() if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 { vfs.mountMu.Lock() @@ -517,10 +517,10 @@ func (mntns *MountNamespace) DecRef() { vfs.mounts.seq.EndWrite() vfs.mountMu.Unlock() for _, vd := range vdsToDecRef { - vd.DecRef() + vd.DecRef(ctx) } for _, mnt := range mountsToDecRef { - mnt.DecRef() + mnt.DecRef(ctx) } } else if refs < 0 { panic("MountNamespace.DecRef() called without holding a reference") @@ -534,7 +534,7 @@ func (mntns *MountNamespace) DecRef() { // getMountAt is analogous to Linux's fs/namei.c:follow_mount(). // // Preconditions: References are held on mnt and d. -func (vfs *VirtualFilesystem) getMountAt(mnt *Mount, d *Dentry) *Mount { +func (vfs *VirtualFilesystem) getMountAt(ctx context.Context, mnt *Mount, d *Dentry) *Mount { // The first mount is special-cased: // // - The caller is assumed to have checked d.isMounted() already. (This @@ -565,7 +565,7 @@ retryFirst: // Raced with umount. continue } - mnt.DecRef() + mnt.DecRef(ctx) mnt = next d = next.root } @@ -578,7 +578,7 @@ retryFirst: // // Preconditions: References are held on mnt and root. vfsroot is not (mnt, // mnt.root). -func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) VirtualDentry { +func (vfs *VirtualFilesystem) getMountpointAt(ctx context.Context, mnt *Mount, vfsroot VirtualDentry) VirtualDentry { // The first mount is special-cased: // // - The caller must have already checked mnt against vfsroot. @@ -602,12 +602,12 @@ retryFirst: if !point.TryIncRef() { // Since Mount holds a reference on Mount.key.point, this can only // happen due to a racing change to Mount.key. - parent.DecRef() + parent.DecRef(ctx) goto retryFirst } if !vfs.mounts.seq.ReadOk(epoch) { - point.DecRef() - parent.DecRef() + point.DecRef(ctx) + parent.DecRef(ctx) goto retryFirst } mnt = parent @@ -635,16 +635,16 @@ retryFirst: if !point.TryIncRef() { // Since Mount holds a reference on Mount.key.point, this can // only happen due to a racing change to Mount.key. - parent.DecRef() + parent.DecRef(ctx) goto retryNotFirst } if !vfs.mounts.seq.ReadOk(epoch) { - point.DecRef() - parent.DecRef() + point.DecRef(ctx) + parent.DecRef(ctx) goto retryNotFirst } - d.DecRef() - mnt.DecRef() + d.DecRef(ctx) + mnt.DecRef(ctx) mnt = parent d = point } diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go index cd78d66bc..e4da15009 100644 --- a/pkg/sentry/vfs/pathname.go +++ b/pkg/sentry/vfs/pathname.go @@ -47,7 +47,7 @@ func (vfs *VirtualFilesystem) PathnameWithDeleted(ctx context.Context, vfsroot, haveRef := false defer func() { if haveRef { - vd.DecRef() + vd.DecRef(ctx) } }() @@ -64,12 +64,12 @@ loop: // of FilesystemImpl.PrependPath() may return nil instead. break loop } - nextVD := vfs.getMountpointAt(vd.mount, vfsroot) + nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot) if !nextVD.Ok() { break loop } if haveRef { - vd.DecRef() + vd.DecRef(ctx) } vd = nextVD haveRef = true @@ -101,7 +101,7 @@ func (vfs *VirtualFilesystem) PathnameReachable(ctx context.Context, vfsroot, vd haveRef := false defer func() { if haveRef { - vd.DecRef() + vd.DecRef(ctx) } }() loop: @@ -112,12 +112,12 @@ loop: if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry { break loop } - nextVD := vfs.getMountpointAt(vd.mount, vfsroot) + nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot) if !nextVD.Ok() { return "", nil } if haveRef { - vd.DecRef() + vd.DecRef(ctx) } vd = nextVD haveRef = true @@ -145,7 +145,7 @@ func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd haveRef := false defer func() { if haveRef { - vd.DecRef() + vd.DecRef(ctx) } }() unreachable := false @@ -157,13 +157,13 @@ loop: if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry { break loop } - nextVD := vfs.getMountpointAt(vd.mount, vfsroot) + nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot) if !nextVD.Ok() { unreachable = true break loop } if haveRef { - vd.DecRef() + vd.DecRef(ctx) } vd = nextVD haveRef = true diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index 9d047ff88..3304372d9 100644 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sync" @@ -136,31 +137,31 @@ func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *Pat return rp } -func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) { +func (vfs *VirtualFilesystem) putResolvingPath(ctx context.Context, rp *ResolvingPath) { rp.root = VirtualDentry{} - rp.decRefStartAndMount() + rp.decRefStartAndMount(ctx) rp.mount = nil rp.start = nil - rp.releaseErrorState() + rp.releaseErrorState(ctx) resolvingPathPool.Put(rp) } -func (rp *ResolvingPath) decRefStartAndMount() { +func (rp *ResolvingPath) decRefStartAndMount(ctx context.Context) { if rp.flags&rpflagsHaveStartRef != 0 { - rp.start.DecRef() + rp.start.DecRef(ctx) } if rp.flags&rpflagsHaveMountRef != 0 { - rp.mount.DecRef() + rp.mount.DecRef(ctx) } } -func (rp *ResolvingPath) releaseErrorState() { +func (rp *ResolvingPath) releaseErrorState(ctx context.Context) { if rp.nextStart != nil { - rp.nextStart.DecRef() + rp.nextStart.DecRef(ctx) rp.nextStart = nil } if rp.nextMount != nil { - rp.nextMount.DecRef() + rp.nextMount.DecRef(ctx) rp.nextMount = nil } } @@ -236,13 +237,13 @@ func (rp *ResolvingPath) Advance() { // Restart resets the stream of path components represented by rp to its state // on entry to the current FilesystemImpl method. -func (rp *ResolvingPath) Restart() { +func (rp *ResolvingPath) Restart(ctx context.Context) { rp.pit = rp.origParts[rp.numOrigParts-1] rp.mustBeDir = rp.mustBeDirOrig rp.symlinks = rp.symlinksOrig rp.curPart = rp.numOrigParts - 1 copy(rp.parts[:], rp.origParts[:rp.numOrigParts]) - rp.releaseErrorState() + rp.releaseErrorState(ctx) } func (rp *ResolvingPath) relpathCommit() { @@ -260,13 +261,13 @@ func (rp *ResolvingPath) relpathCommit() { // Mount, CheckRoot returns (unspecified, non-nil error). Otherwise, path // resolution should resolve d's parent normally, and CheckRoot returns (false, // nil). -func (rp *ResolvingPath) CheckRoot(d *Dentry) (bool, error) { +func (rp *ResolvingPath) CheckRoot(ctx context.Context, d *Dentry) (bool, error) { if d == rp.root.dentry && rp.mount == rp.root.mount { // At contextual VFS root (due to e.g. chroot(2)). return true, nil } else if d == rp.mount.root { // At mount root ... - vd := rp.vfs.getMountpointAt(rp.mount, rp.root) + vd := rp.vfs.getMountpointAt(ctx, rp.mount, rp.root) if vd.Ok() { // ... of non-root mount. rp.nextMount = vd.mount @@ -283,11 +284,11 @@ func (rp *ResolvingPath) CheckRoot(d *Dentry) (bool, error) { // to d. If d is a mount point, such that path resolution should switch to // another Mount, CheckMount returns a non-nil error. Otherwise, CheckMount // returns nil. -func (rp *ResolvingPath) CheckMount(d *Dentry) error { +func (rp *ResolvingPath) CheckMount(ctx context.Context, d *Dentry) error { if !d.isMounted() { return nil } - if mnt := rp.vfs.getMountAt(rp.mount, d); mnt != nil { + if mnt := rp.vfs.getMountAt(ctx, rp.mount, d); mnt != nil { rp.nextMount = mnt return resolveMountPointError{} } @@ -389,11 +390,11 @@ func (rp *ResolvingPath) HandleJump(target VirtualDentry) error { return resolveMountRootOrJumpError{} } -func (rp *ResolvingPath) handleError(err error) bool { +func (rp *ResolvingPath) handleError(ctx context.Context, err error) bool { switch err.(type) { case resolveMountRootOrJumpError: // Switch to the new Mount. We hold references on the Mount and Dentry. - rp.decRefStartAndMount() + rp.decRefStartAndMount(ctx) rp.mount = rp.nextMount rp.start = rp.nextStart rp.flags |= rpflagsHaveMountRef | rpflagsHaveStartRef @@ -412,7 +413,7 @@ func (rp *ResolvingPath) handleError(err error) bool { case resolveMountPointError: // Switch to the new Mount. We hold a reference on the Mount, but // borrow the reference on the mount root from the Mount. - rp.decRefStartAndMount() + rp.decRefStartAndMount(ctx) rp.mount = rp.nextMount rp.start = rp.nextMount.root rp.flags = rp.flags&^rpflagsHaveStartRef | rpflagsHaveMountRef @@ -423,12 +424,12 @@ func (rp *ResolvingPath) handleError(err error) bool { // path. rp.relpathCommit() // Restart path resolution on the new Mount. - rp.releaseErrorState() + rp.releaseErrorState(ctx) return true case resolveAbsSymlinkError: // Switch to the new Mount. References are borrowed from rp.root. - rp.decRefStartAndMount() + rp.decRefStartAndMount(ctx) rp.mount = rp.root.mount rp.start = rp.root.dentry rp.flags &^= rpflagsHaveMountRef | rpflagsHaveStartRef @@ -440,7 +441,7 @@ func (rp *ResolvingPath) handleError(err error) bool { // path, including the symlink target we just prepended. rp.relpathCommit() // Restart path resolution on the new Mount. - rp.releaseErrorState() + rp.releaseErrorState(ctx) return true default: diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 522e27475..9c2420683 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -122,7 +122,7 @@ type VirtualFilesystem struct { } // Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes. -func (vfs *VirtualFilesystem) Init() error { +func (vfs *VirtualFilesystem) Init(ctx context.Context) error { if vfs.mountpoints != nil { panic("VFS already initialized") } @@ -145,7 +145,7 @@ func (vfs *VirtualFilesystem) Init() error { devMinor: anonfsDevMinor, } anonfs.vfsfs.Init(vfs, &anonFilesystemType{}, &anonfs) - defer anonfs.vfsfs.DecRef() + defer anonfs.vfsfs.DecRef(ctx) anonMount, err := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{}) if err != nil { // We should not be passing any MountOptions that would cause @@ -192,11 +192,11 @@ func (vfs *VirtualFilesystem) AccessAt(ctx context.Context, creds *auth.Credenti for { err := rp.mount.fs.impl.AccessAt(ctx, rp, creds, ats) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -214,11 +214,11 @@ func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Crede dentry: d, } rp.mount.IncRef() - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return vd, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return VirtualDentry{}, err } } @@ -236,7 +236,7 @@ func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *au } rp.mount.IncRef() name := rp.Component() - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return parentVD, name, nil } if checkInvariants { @@ -244,8 +244,8 @@ func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *au panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return VirtualDentry{}, "", err } } @@ -260,14 +260,14 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential } if !newpop.Path.Begin.Ok() { - oldVD.DecRef() + oldVD.DecRef(ctx) if newpop.Path.Absolute { return syserror.EEXIST } return syserror.ENOENT } if newpop.FollowFinalSymlink { - oldVD.DecRef() + oldVD.DecRef(ctx) ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink") return syserror.EINVAL } @@ -276,8 +276,8 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential for { err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD) if err == nil { - vfs.putResolvingPath(rp) - oldVD.DecRef() + vfs.putResolvingPath(ctx, rp) + oldVD.DecRef(ctx) return nil } if checkInvariants { @@ -285,9 +285,9 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - oldVD.DecRef() + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) + oldVD.DecRef(ctx) return err } } @@ -313,7 +313,7 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia for { err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } if checkInvariants { @@ -321,8 +321,8 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -346,7 +346,7 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia for { err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } if checkInvariants { @@ -354,8 +354,8 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -408,31 +408,31 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential for { fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) if opts.FileExec { if fd.Mount().Flags.NoExec { - fd.DecRef() + fd.DecRef(ctx) return nil, syserror.EACCES } // Only a regular file can be executed. stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE}) if err != nil { - fd.DecRef() + fd.DecRef(ctx) return nil, err } if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG { - fd.DecRef() + fd.DecRef(ctx) return nil, syserror.EACCES } } - fd.Dentry().InotifyWithParent(linux.IN_OPEN, 0, PathEvent) + fd.Dentry().InotifyWithParent(ctx, linux.IN_OPEN, 0, PathEvent) return fd, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return nil, err } } @@ -444,11 +444,11 @@ func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Creden for { target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return target, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return "", err } } @@ -472,19 +472,19 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti return err } if oldName == "." || oldName == ".." { - oldParentVD.DecRef() + oldParentVD.DecRef(ctx) return syserror.EBUSY } if !newpop.Path.Begin.Ok() { - oldParentVD.DecRef() + oldParentVD.DecRef(ctx) if newpop.Path.Absolute { return syserror.EBUSY } return syserror.ENOENT } if newpop.FollowFinalSymlink { - oldParentVD.DecRef() + oldParentVD.DecRef(ctx) ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink") return syserror.EINVAL } @@ -497,8 +497,8 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti for { err := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts) if err == nil { - vfs.putResolvingPath(rp) - oldParentVD.DecRef() + vfs.putResolvingPath(ctx, rp) + oldParentVD.DecRef(ctx) return nil } if checkInvariants { @@ -506,9 +506,9 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - oldParentVD.DecRef() + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) + oldParentVD.DecRef(ctx) return err } } @@ -531,7 +531,7 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia for { err := rp.mount.fs.impl.RmdirAt(ctx, rp) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } if checkInvariants { @@ -539,8 +539,8 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -552,11 +552,11 @@ func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credent for { err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -568,11 +568,11 @@ func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credential for { stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return stat, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return linux.Statx{}, err } } @@ -585,11 +585,11 @@ func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credenti for { statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return statfs, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return linux.Statfs{}, err } } @@ -612,7 +612,7 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent for { err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } if checkInvariants { @@ -620,8 +620,8 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -644,7 +644,7 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti for { err := rp.mount.fs.impl.UnlinkAt(ctx, rp) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } if checkInvariants { @@ -652,8 +652,8 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -671,7 +671,7 @@ func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.C for { bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return bep, nil } if checkInvariants { @@ -679,8 +679,8 @@ func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.C panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return nil, err } } @@ -693,7 +693,7 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede for { names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp, size) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return names, nil } if err == syserror.ENOTSUP { @@ -701,11 +701,11 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede // fs/xattr.c:vfs_listxattr() falls back to allowing the security // subsystem to return security extended attributes, which by // default don't exist. - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return nil, err } } @@ -718,11 +718,11 @@ func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Creden for { val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return val, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return "", err } } @@ -735,11 +735,11 @@ func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Creden for { err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -751,11 +751,11 @@ func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Cre for { err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -777,7 +777,7 @@ func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error { if err := fs.impl.Sync(ctx); err != nil && retErr == nil { retErr = err } - fs.DecRef() + fs.DecRef(ctx) } return retErr } @@ -831,9 +831,9 @@ func (vd VirtualDentry) IncRef() { // DecRef decrements the reference counts on the Mount and Dentry represented // by vd. -func (vd VirtualDentry) DecRef() { - vd.dentry.DecRef() - vd.mount.DecRef() +func (vd VirtualDentry) DecRef(ctx context.Context) { + vd.dentry.DecRef(ctx) + vd.mount.DecRef(ctx) } // Mount returns the Mount associated with vd. It does not take a reference on diff --git a/pkg/syncevent/waiter_amd64.s b/pkg/syncevent/waiter_amd64.s index 985b56ae5..5e216b045 100644 --- a/pkg/syncevent/waiter_amd64.s +++ b/pkg/syncevent/waiter_amd64.s @@ -16,9 +16,9 @@ // See waiter_noasm_unsafe.go for a description of waiterUnlock. // -// func waiterUnlock(g unsafe.Pointer, wg *unsafe.Pointer) bool +// func waiterUnlock(ptr unsafe.Pointer, wg *unsafe.Pointer) bool TEXT ·waiterUnlock(SB),NOSPLIT,$0-24 - MOVQ g+0(FP), DI + MOVQ ptr+0(FP), DI MOVQ wg+8(FP), SI MOVQ $·preparingG(SB), AX diff --git a/pkg/syncevent/waiter_arm64.s b/pkg/syncevent/waiter_arm64.s index 20d7ac23b..f4c06f194 100644 --- a/pkg/syncevent/waiter_arm64.s +++ b/pkg/syncevent/waiter_arm64.s @@ -16,11 +16,11 @@ // See waiter_noasm_unsafe.go for a description of waiterUnlock. // -// func waiterUnlock(g unsafe.Pointer, wg *unsafe.Pointer) bool +// func waiterUnlock(ptr unsafe.Pointer, wg *unsafe.Pointer) bool TEXT ·waiterUnlock(SB),NOSPLIT,$0-24 MOVD wg+8(FP), R0 MOVD $·preparingG(SB), R1 - MOVD g+0(FP), R2 + MOVD ptr+0(FP), R2 again: LDAXR (R0), R3 CMP R1, R3 diff --git a/pkg/syncevent/waiter_asm_unsafe.go b/pkg/syncevent/waiter_asm_unsafe.go index 0995e9053..19d6b0b15 100644 --- a/pkg/syncevent/waiter_asm_unsafe.go +++ b/pkg/syncevent/waiter_asm_unsafe.go @@ -21,4 +21,4 @@ import ( ) // See waiter_noasm_unsafe.go for a description of waiterUnlock. -func waiterUnlock(g unsafe.Pointer, wg *unsafe.Pointer) bool +func waiterUnlock(ptr unsafe.Pointer, wg *unsafe.Pointer) bool diff --git a/pkg/syncevent/waiter_noasm_unsafe.go b/pkg/syncevent/waiter_noasm_unsafe.go index 1c4b0e39a..0f74a689c 100644 --- a/pkg/syncevent/waiter_noasm_unsafe.go +++ b/pkg/syncevent/waiter_noasm_unsafe.go @@ -32,8 +32,8 @@ import ( // should be aborted. // //go:nosplit -func waiterUnlock(g unsafe.Pointer, wg *unsafe.Pointer) bool { +func waiterUnlock(ptr unsafe.Pointer, wg *unsafe.Pointer) bool { // The only way this CAS can fail is if a call to Waiter.NotifyPending() // has replaced *wg with nil, in which case we should not sleep. - return atomic.CompareAndSwapPointer(wg, (unsafe.Pointer)(&preparingG), g) + return atomic.CompareAndSwapPointer(wg, (unsafe.Pointer)(&preparingG), ptr) } diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go index c73072c42..798e07b01 100644 --- a/pkg/syserror/syserror.go +++ b/pkg/syserror/syserror.go @@ -61,6 +61,7 @@ var ( ENOMEM = error(syscall.ENOMEM) ENOSPC = error(syscall.ENOSPC) ENOSYS = error(syscall.ENOSYS) + ENOTCONN = error(syscall.ENOTCONN) ENOTDIR = error(syscall.ENOTDIR) ENOTEMPTY = error(syscall.ENOTEMPTY) ENOTSOCK = error(syscall.ENOTSOCK) diff --git a/pkg/tcpip/header/ipv4.go b/pkg/tcpip/header/ipv4.go index 62ac932bb..d0d1efd0d 100644 --- a/pkg/tcpip/header/ipv4.go +++ b/pkg/tcpip/header/ipv4.go @@ -101,6 +101,11 @@ const ( // IPv4Version is the version of the ipv4 protocol. IPv4Version = 4 + // IPv4AllSystems is the all systems IPv4 multicast address as per + // IANA's IPv4 Multicast Address Space Registry. See + // https://www.iana.org/assignments/multicast-addresses/multicast-addresses.xhtml. + IPv4AllSystems tcpip.Address = "\xe0\x00\x00\x01" + // IPv4Broadcast is the broadcast address of the IPv4 procotol. IPv4Broadcast tcpip.Address = "\xff\xff\xff\xff" diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD index e0db6cf54..6c137f693 100644 --- a/pkg/tcpip/link/tun/BUILD +++ b/pkg/tcpip/link/tun/BUILD @@ -12,6 +12,7 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/refs", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go index 04ae58e59..22b0a12bd 100644 --- a/pkg/tcpip/link/tun/device.go +++ b/pkg/tcpip/link/tun/device.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -64,14 +65,14 @@ func (d *Device) beforeSave() { } // Release implements fs.FileOperations.Release. -func (d *Device) Release() { +func (d *Device) Release(ctx context.Context) { d.mu.Lock() defer d.mu.Unlock() // Decrease refcount if there is an endpoint associated with this file. if d.endpoint != nil { d.endpoint.RemoveNotify(d.notifyHandle) - d.endpoint.DecRef() + d.endpoint.DecRef(ctx) d.endpoint = nil } } @@ -341,8 +342,8 @@ type tunEndpoint struct { } // DecRef decrements refcount of e, removes NIC if refcount goes to 0. -func (e *tunEndpoint) DecRef() { - e.DecRefWithDestructor(func() { +func (e *tunEndpoint) DecRef(ctx context.Context) { + e.DecRefWithDestructor(ctx, func(context.Context) { e.stack.RemoveNIC(e.nicID) }) } diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go index 2982450f8..1827666c5 100644 --- a/pkg/tcpip/network/fragmentation/fragmentation.go +++ b/pkg/tcpip/network/fragmentation/fragmentation.go @@ -17,28 +17,58 @@ package fragmentation import ( + "errors" "fmt" "log" "time" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" ) -// DefaultReassembleTimeout is based on the linux stack: net.ipv4.ipfrag_time. -const DefaultReassembleTimeout = 30 * time.Second +const ( + // DefaultReassembleTimeout is based on the linux stack: net.ipv4.ipfrag_time. + DefaultReassembleTimeout = 30 * time.Second -// HighFragThreshold is the threshold at which we start trimming old -// fragmented packets. Linux uses a default value of 4 MB. See -// net.ipv4.ipfrag_high_thresh for more information. -const HighFragThreshold = 4 << 20 // 4MB + // HighFragThreshold is the threshold at which we start trimming old + // fragmented packets. Linux uses a default value of 4 MB. See + // net.ipv4.ipfrag_high_thresh for more information. + HighFragThreshold = 4 << 20 // 4MB -// LowFragThreshold is the threshold we reach to when we start dropping -// older fragmented packets. It's important that we keep enough room for newer -// packets to be re-assembled. Hence, this needs to be lower than -// HighFragThreshold enough. Linux uses a default value of 3 MB. See -// net.ipv4.ipfrag_low_thresh for more information. -const LowFragThreshold = 3 << 20 // 3MB + // LowFragThreshold is the threshold we reach to when we start dropping + // older fragmented packets. It's important that we keep enough room for newer + // packets to be re-assembled. Hence, this needs to be lower than + // HighFragThreshold enough. Linux uses a default value of 3 MB. See + // net.ipv4.ipfrag_low_thresh for more information. + LowFragThreshold = 3 << 20 // 3MB + + // minBlockSize is the minimum block size for fragments. + minBlockSize = 1 +) + +var ( + // ErrInvalidArgs indicates to the caller that that an invalid argument was + // provided. + ErrInvalidArgs = errors.New("invalid args") +) + +// FragmentID is the identifier for a fragment. +type FragmentID struct { + // Source is the source address of the fragment. + Source tcpip.Address + + // Destination is the destination address of the fragment. + Destination tcpip.Address + + // ID is the identification value of the fragment. + // + // This is a uint32 because IPv6 uses a 32-bit identification value. + ID uint32 + + // The protocol for the packet. + Protocol uint8 +} // Fragmentation is the main structure that other modules // of the stack should use to implement IP Fragmentation. @@ -46,14 +76,17 @@ type Fragmentation struct { mu sync.Mutex highLimit int lowLimit int - reassemblers map[uint32]*reassembler + reassemblers map[FragmentID]*reassembler rList reassemblerList size int timeout time.Duration + blockSize uint16 } // NewFragmentation creates a new Fragmentation. // +// blockSize specifies the fragment block size, in bytes. +// // highMemoryLimit specifies the limit on the memory consumed // by the fragments stored by Fragmentation (overhead of internal data-structures // is not accounted). Fragments are dropped when the limit is reached. @@ -64,7 +97,7 @@ type Fragmentation struct { // reassemblingTimeout specifies the maximum time allowed to reassemble a packet. // Fragments are lazily evicted only when a new a packet with an // already existing fragmentation-id arrives after the timeout. -func NewFragmentation(highMemoryLimit, lowMemoryLimit int, reassemblingTimeout time.Duration) *Fragmentation { +func NewFragmentation(blockSize uint16, highMemoryLimit, lowMemoryLimit int, reassemblingTimeout time.Duration) *Fragmentation { if lowMemoryLimit >= highMemoryLimit { lowMemoryLimit = highMemoryLimit } @@ -73,17 +106,46 @@ func NewFragmentation(highMemoryLimit, lowMemoryLimit int, reassemblingTimeout t lowMemoryLimit = 0 } + if blockSize < minBlockSize { + blockSize = minBlockSize + } + return &Fragmentation{ - reassemblers: make(map[uint32]*reassembler), + reassemblers: make(map[FragmentID]*reassembler), highLimit: highMemoryLimit, lowLimit: lowMemoryLimit, timeout: reassemblingTimeout, + blockSize: blockSize, } } // Process processes an incoming fragment belonging to an ID and returns a // complete packet when all the packets belonging to that ID have been received. -func (f *Fragmentation) Process(id uint32, first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool, error) { +// +// [first, last] is the range of the fragment bytes. +// +// first must be a multiple of the block size f is configured with. The size +// of the fragment data must be a multiple of the block size, unless there are +// no fragments following this fragment (more set to false). +func (f *Fragmentation) Process(id FragmentID, first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool, error) { + if first > last { + return buffer.VectorisedView{}, false, fmt.Errorf("first=%d is greater than last=%d: %w", first, last, ErrInvalidArgs) + } + + if first%f.blockSize != 0 { + return buffer.VectorisedView{}, false, fmt.Errorf("first=%d is not a multiple of block size=%d: %w", first, f.blockSize, ErrInvalidArgs) + } + + fragmentSize := last - first + 1 + if more && fragmentSize%f.blockSize != 0 { + return buffer.VectorisedView{}, false, fmt.Errorf("fragment size=%d bytes is not a multiple of block size=%d on non-final fragment: %w", fragmentSize, f.blockSize, ErrInvalidArgs) + } + + if l := vv.Size(); l < int(fragmentSize) { + return buffer.VectorisedView{}, false, fmt.Errorf("got fragment size=%d bytes less than the expected fragment size=%d bytes (first=%d last=%d): %w", l, fragmentSize, first, last, ErrInvalidArgs) + } + vv.CapLength(int(fragmentSize)) + f.mu.Lock() r, ok := f.reassemblers[id] if ok && r.tooOld(f.timeout) { diff --git a/pkg/tcpip/network/fragmentation/fragmentation_test.go b/pkg/tcpip/network/fragmentation/fragmentation_test.go index 72c0f53be..9eedd33c4 100644 --- a/pkg/tcpip/network/fragmentation/fragmentation_test.go +++ b/pkg/tcpip/network/fragmentation/fragmentation_test.go @@ -15,6 +15,7 @@ package fragmentation import ( + "errors" "reflect" "testing" "time" @@ -33,7 +34,7 @@ func vv(size int, pieces ...string) buffer.VectorisedView { } type processInput struct { - id uint32 + id FragmentID first uint16 last uint16 more bool @@ -53,8 +54,8 @@ var processTestCases = []struct { { comment: "One ID", in: []processInput{ - {id: 0, first: 0, last: 1, more: true, vv: vv(2, "01")}, - {id: 0, first: 2, last: 3, more: false, vv: vv(2, "23")}, + {id: FragmentID{ID: 0}, first: 0, last: 1, more: true, vv: vv(2, "01")}, + {id: FragmentID{ID: 0}, first: 2, last: 3, more: false, vv: vv(2, "23")}, }, out: []processOutput{ {vv: buffer.VectorisedView{}, done: false}, @@ -64,10 +65,10 @@ var processTestCases = []struct { { comment: "Two IDs", in: []processInput{ - {id: 0, first: 0, last: 1, more: true, vv: vv(2, "01")}, - {id: 1, first: 0, last: 1, more: true, vv: vv(2, "ab")}, - {id: 1, first: 2, last: 3, more: false, vv: vv(2, "cd")}, - {id: 0, first: 2, last: 3, more: false, vv: vv(2, "23")}, + {id: FragmentID{ID: 0}, first: 0, last: 1, more: true, vv: vv(2, "01")}, + {id: FragmentID{ID: 1}, first: 0, last: 1, more: true, vv: vv(2, "ab")}, + {id: FragmentID{ID: 1}, first: 2, last: 3, more: false, vv: vv(2, "cd")}, + {id: FragmentID{ID: 0}, first: 2, last: 3, more: false, vv: vv(2, "23")}, }, out: []processOutput{ {vv: buffer.VectorisedView{}, done: false}, @@ -81,7 +82,7 @@ var processTestCases = []struct { func TestFragmentationProcess(t *testing.T) { for _, c := range processTestCases { t.Run(c.comment, func(t *testing.T) { - f := NewFragmentation(1024, 512, DefaultReassembleTimeout) + f := NewFragmentation(minBlockSize, 1024, 512, DefaultReassembleTimeout) for i, in := range c.in { vv, done, err := f.Process(in.id, in.first, in.last, in.more, in.vv) if err != nil { @@ -110,14 +111,14 @@ func TestFragmentationProcess(t *testing.T) { func TestReassemblingTimeout(t *testing.T) { timeout := time.Millisecond - f := NewFragmentation(1024, 512, timeout) + f := NewFragmentation(minBlockSize, 1024, 512, timeout) // Send first fragment with id = 0, first = 0, last = 0, and more = true. - f.Process(0, 0, 0, true, vv(1, "0")) + f.Process(FragmentID{}, 0, 0, true, vv(1, "0")) // Sleep more than the timeout. time.Sleep(2 * timeout) // Send another fragment that completes a packet. // However, no packet should be reassembled because the fragment arrived after the timeout. - _, done, err := f.Process(0, 1, 1, false, vv(1, "1")) + _, done, err := f.Process(FragmentID{}, 1, 1, false, vv(1, "1")) if err != nil { t.Fatalf("f.Process(0, 1, 1, false, vv(1, \"1\")) failed: %v", err) } @@ -127,35 +128,35 @@ func TestReassemblingTimeout(t *testing.T) { } func TestMemoryLimits(t *testing.T) { - f := NewFragmentation(3, 1, DefaultReassembleTimeout) + f := NewFragmentation(minBlockSize, 3, 1, DefaultReassembleTimeout) // Send first fragment with id = 0. - f.Process(0, 0, 0, true, vv(1, "0")) + f.Process(FragmentID{ID: 0}, 0, 0, true, vv(1, "0")) // Send first fragment with id = 1. - f.Process(1, 0, 0, true, vv(1, "1")) + f.Process(FragmentID{ID: 1}, 0, 0, true, vv(1, "1")) // Send first fragment with id = 2. - f.Process(2, 0, 0, true, vv(1, "2")) + f.Process(FragmentID{ID: 2}, 0, 0, true, vv(1, "2")) // Send first fragment with id = 3. This should caused id = 0 and id = 1 to be // evicted. - f.Process(3, 0, 0, true, vv(1, "3")) + f.Process(FragmentID{ID: 3}, 0, 0, true, vv(1, "3")) - if _, ok := f.reassemblers[0]; ok { + if _, ok := f.reassemblers[FragmentID{ID: 0}]; ok { t.Errorf("Memory limits are not respected: id=0 has not been evicted.") } - if _, ok := f.reassemblers[1]; ok { + if _, ok := f.reassemblers[FragmentID{ID: 1}]; ok { t.Errorf("Memory limits are not respected: id=1 has not been evicted.") } - if _, ok := f.reassemblers[3]; !ok { + if _, ok := f.reassemblers[FragmentID{ID: 3}]; !ok { t.Errorf("Implementation of memory limits is wrong: id=3 is not present.") } } func TestMemoryLimitsIgnoresDuplicates(t *testing.T) { - f := NewFragmentation(1, 0, DefaultReassembleTimeout) + f := NewFragmentation(minBlockSize, 1, 0, DefaultReassembleTimeout) // Send first fragment with id = 0. - f.Process(0, 0, 0, true, vv(1, "0")) + f.Process(FragmentID{}, 0, 0, true, vv(1, "0")) // Send the same packet again. - f.Process(0, 0, 0, true, vv(1, "0")) + f.Process(FragmentID{}, 0, 0, true, vv(1, "0")) got := f.size want := 1 @@ -163,3 +164,97 @@ func TestMemoryLimitsIgnoresDuplicates(t *testing.T) { t.Errorf("Wrong size, duplicates are not handled correctly: got=%d, want=%d.", got, want) } } + +func TestErrors(t *testing.T) { + tests := []struct { + name string + blockSize uint16 + first uint16 + last uint16 + more bool + data string + err error + }{ + { + name: "exact block size without more", + blockSize: 2, + first: 2, + last: 3, + more: false, + data: "01", + }, + { + name: "exact block size with more", + blockSize: 2, + first: 2, + last: 3, + more: true, + data: "01", + }, + { + name: "exact block size with more and extra data", + blockSize: 2, + first: 2, + last: 3, + more: true, + data: "012", + }, + { + name: "exact block size with more and too little data", + blockSize: 2, + first: 2, + last: 3, + more: true, + data: "0", + err: ErrInvalidArgs, + }, + { + name: "not exact block size with more", + blockSize: 2, + first: 2, + last: 2, + more: true, + data: "0", + err: ErrInvalidArgs, + }, + { + name: "not exact block size without more", + blockSize: 2, + first: 2, + last: 2, + more: false, + data: "0", + }, + { + name: "first not a multiple of block size", + blockSize: 2, + first: 3, + last: 4, + more: true, + data: "01", + err: ErrInvalidArgs, + }, + { + name: "first more than last", + blockSize: 2, + first: 4, + last: 3, + more: true, + data: "01", + err: ErrInvalidArgs, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + f := NewFragmentation(test.blockSize, HighFragThreshold, LowFragThreshold, DefaultReassembleTimeout) + _, done, err := f.Process(FragmentID{}, test.first, test.last, test.more, vv(len(test.data), test.data)) + if !errors.Is(err, test.err) { + t.Errorf("got Proceess(_, %d, %d, %t, %q) = (_, _, %v), want = (_, _, %v)", test.first, test.last, test.more, test.data, err, test.err) + } + if done { + t.Errorf("got Proceess(_, %d, %d, %t, %q) = (_, true, _), want = (_, false, _)", test.first, test.last, test.more, test.data) + } + }) + } +} diff --git a/pkg/tcpip/network/fragmentation/reassembler.go b/pkg/tcpip/network/fragmentation/reassembler.go index 0a83d81f2..50d30bbf0 100644 --- a/pkg/tcpip/network/fragmentation/reassembler.go +++ b/pkg/tcpip/network/fragmentation/reassembler.go @@ -32,7 +32,7 @@ type hole struct { type reassembler struct { reassemblerEntry - id uint32 + id FragmentID size int mu sync.Mutex holes []hole @@ -42,7 +42,7 @@ type reassembler struct { creationTime time.Time } -func newReassembler(id uint32) *reassembler { +func newReassembler(id FragmentID) *reassembler { r := &reassembler{ id: id, holes: make([]hole, 0, 16), diff --git a/pkg/tcpip/network/fragmentation/reassembler_test.go b/pkg/tcpip/network/fragmentation/reassembler_test.go index 7eee0710d..dff7c9dcb 100644 --- a/pkg/tcpip/network/fragmentation/reassembler_test.go +++ b/pkg/tcpip/network/fragmentation/reassembler_test.go @@ -94,7 +94,7 @@ var holesTestCases = []struct { func TestUpdateHoles(t *testing.T) { for _, c := range holesTestCases { - r := newReassembler(0) + r := newReassembler(FragmentID{}) for _, i := range c.in { r.updateHoles(i.first, i.last, i.more) } diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index b1776e5ee..6c4f0ae3e 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -45,30 +45,32 @@ const ( // buckets is the number of identifier buckets. buckets = 2048 + + // The size of a fragment block, in bytes, as per RFC 791 section 3.1, + // page 14. + fragmentblockSize = 8 ) type endpoint struct { - nicID tcpip.NICID - id stack.NetworkEndpointID - prefixLen int - linkEP stack.LinkEndpoint - dispatcher stack.TransportDispatcher - fragmentation *fragmentation.Fragmentation - protocol *protocol - stack *stack.Stack + nicID tcpip.NICID + id stack.NetworkEndpointID + prefixLen int + linkEP stack.LinkEndpoint + dispatcher stack.TransportDispatcher + protocol *protocol + stack *stack.Stack } // NewEndpoint creates a new ipv4 endpoint. func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint, st *stack.Stack) (stack.NetworkEndpoint, *tcpip.Error) { e := &endpoint{ - nicID: nicID, - id: stack.NetworkEndpointID{LocalAddress: addrWithPrefix.Address}, - prefixLen: addrWithPrefix.PrefixLen, - linkEP: linkEP, - dispatcher: dispatcher, - fragmentation: fragmentation.NewFragmentation(fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, fragmentation.DefaultReassembleTimeout), - protocol: p, - stack: st, + nicID: nicID, + id: stack.NetworkEndpointID{LocalAddress: addrWithPrefix.Address}, + prefixLen: addrWithPrefix.PrefixLen, + linkEP: linkEP, + dispatcher: dispatcher, + protocol: p, + stack: st, } return e, nil @@ -438,7 +440,20 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) { } var ready bool var err error - pkt.Data, ready, err = e.fragmentation.Process(hash.IPv4FragmentHash(h), h.FragmentOffset(), last, h.More(), pkt.Data) + pkt.Data, ready, err = e.protocol.fragmentation.Process( + // As per RFC 791 section 2.3, the identification value is unique + // for a source-destination pair and protocol. + fragmentation.FragmentID{ + Source: h.SourceAddress(), + Destination: h.DestinationAddress(), + ID: uint32(h.ID()), + Protocol: h.Protocol(), + }, + h.FragmentOffset(), + last, + h.More(), + pkt.Data, + ) if err != nil { r.Stats().IP.MalformedPacketsReceived.Increment() r.Stats().IP.MalformedFragmentsReceived.Increment() @@ -469,6 +484,8 @@ type protocol struct { // uint8 portion of it is meaningful and it must be accessed // atomically. defaultTTL uint32 + + fragmentation *fragmentation.Fragmentation } // Number returns the ipv4 protocol number. @@ -590,5 +607,10 @@ func NewProtocol() stack.NetworkProtocol { } hashIV := r[buckets] - return &protocol{ids: ids, hashIV: hashIV, defaultTTL: DefaultTTL} + return &protocol{ + ids: ids, + hashIV: hashIV, + defaultTTL: DefaultTTL, + fragmentation: fragmentation.NewFragmentation(fragmentblockSize, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, fragmentation.DefaultReassembleTimeout), + } } diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go index 11e579c4b..ded97ac64 100644 --- a/pkg/tcpip/network/ipv4/ipv4_test.go +++ b/pkg/tcpip/network/ipv4/ipv4_test.go @@ -486,12 +486,16 @@ func TestInvalidFragments(t *testing.T) { // TestReceiveFragments feeds fragments in through the incoming packet path to // test reassembly func TestReceiveFragments(t *testing.T) { - const addr1 = "\x0c\xa8\x00\x01" // 192.168.0.1 - const addr2 = "\x0c\xa8\x00\x02" // 192.168.0.2 - const nicID = 1 + const ( + nicID = 1 + + addr1 = "\x0c\xa8\x00\x01" // 192.168.0.1 + addr2 = "\x0c\xa8\x00\x02" // 192.168.0.2 + addr3 = "\x0c\xa8\x00\x03" // 192.168.0.3 + ) // Build and return a UDP header containing payload. - udpGen := func(payloadLen int, multiplier uint8) buffer.View { + udpGen := func(payloadLen int, multiplier uint8, src, dst tcpip.Address) buffer.View { payload := buffer.NewView(payloadLen) for i := 0; i < len(payload); i++ { payload[i] = uint8(i) * multiplier @@ -507,20 +511,29 @@ func TestReceiveFragments(t *testing.T) { Length: uint16(udpLength), }) copy(u.Payload(), payload) - sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, addr1, addr2, uint16(udpLength)) + sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, src, dst, uint16(udpLength)) sum = header.Checksum(payload, sum) u.SetChecksum(^u.CalculateChecksum(sum)) return hdr.View() } // UDP header plus a payload of 0..256 - ipv4Payload1 := udpGen(256, 1) - udpPayload1 := ipv4Payload1[header.UDPMinimumSize:] + ipv4Payload1Addr1ToAddr2 := udpGen(256, 1, addr1, addr2) + udpPayload1Addr1ToAddr2 := ipv4Payload1Addr1ToAddr2[header.UDPMinimumSize:] + ipv4Payload1Addr3ToAddr2 := udpGen(256, 1, addr3, addr2) + udpPayload1Addr3ToAddr2 := ipv4Payload1Addr3ToAddr2[header.UDPMinimumSize:] // UDP header plus a payload of 0..256 in increments of 2. - ipv4Payload2 := udpGen(128, 2) - udpPayload2 := ipv4Payload2[header.UDPMinimumSize:] + ipv4Payload2Addr1ToAddr2 := udpGen(128, 2, addr1, addr2) + udpPayload2Addr1ToAddr2 := ipv4Payload2Addr1ToAddr2[header.UDPMinimumSize:] + // UDP header plus a payload of 0..256 in increments of 3. + // Used to test cases where the fragment blocks are not a multiple of + // the fragment block size of 8 (RFC 791 section 3.1 page 14). + ipv4Payload3Addr1ToAddr2 := udpGen(127, 3, addr1, addr2) + udpPayload3Addr1ToAddr2 := ipv4Payload3Addr1ToAddr2[header.UDPMinimumSize:] type fragmentData struct { + srcAddr tcpip.Address + dstAddr tcpip.Address id uint16 flags uint8 fragmentOffset uint16 @@ -536,22 +549,40 @@ func TestReceiveFragments(t *testing.T) { name: "No fragmentation", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, + id: 1, + flags: 0, + fragmentOffset: 0, + payload: ipv4Payload1Addr1ToAddr2, + }, + }, + expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2}, + }, + { + name: "No fragmentation with size not a multiple of fragment block size", + fragments: []fragmentData{ + { + srcAddr: addr1, + dstAddr: addr2, id: 1, flags: 0, fragmentOffset: 0, - payload: ipv4Payload1, + payload: ipv4Payload3Addr1ToAddr2, }, }, - expectedPayloads: [][]byte{udpPayload1}, + expectedPayloads: [][]byte{udpPayload3Addr1ToAddr2}, }, { name: "More fragments without payload", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, id: 1, flags: header.IPv4FlagMoreFragments, fragmentOffset: 0, - payload: ipv4Payload1, + payload: ipv4Payload1Addr1ToAddr2, }, }, expectedPayloads: nil, @@ -560,10 +591,12 @@ func TestReceiveFragments(t *testing.T) { name: "Non-zero fragment offset without payload", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, id: 1, flags: 0, fragmentOffset: 8, - payload: ipv4Payload1, + payload: ipv4Payload1Addr1ToAddr2, }, }, expectedPayloads: nil, @@ -572,34 +605,86 @@ func TestReceiveFragments(t *testing.T) { name: "Two fragments", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, + id: 1, + flags: header.IPv4FlagMoreFragments, + fragmentOffset: 0, + payload: ipv4Payload1Addr1ToAddr2[:64], + }, + { + srcAddr: addr1, + dstAddr: addr2, + id: 1, + flags: 0, + fragmentOffset: 64, + payload: ipv4Payload1Addr1ToAddr2[64:], + }, + }, + expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2}, + }, + { + name: "Two fragments with last fragment size not a multiple of fragment block size", + fragments: []fragmentData{ + { + srcAddr: addr1, + dstAddr: addr2, id: 1, flags: header.IPv4FlagMoreFragments, fragmentOffset: 0, - payload: ipv4Payload1[:64], + payload: ipv4Payload3Addr1ToAddr2[:64], }, { + srcAddr: addr1, + dstAddr: addr2, id: 1, flags: 0, fragmentOffset: 64, - payload: ipv4Payload1[64:], + payload: ipv4Payload3Addr1ToAddr2[64:], }, }, - expectedPayloads: [][]byte{udpPayload1}, + expectedPayloads: [][]byte{udpPayload3Addr1ToAddr2}, + }, + { + name: "Two fragments with first fragment size not a multiple of fragment block size", + fragments: []fragmentData{ + { + srcAddr: addr1, + dstAddr: addr2, + id: 1, + flags: header.IPv4FlagMoreFragments, + fragmentOffset: 0, + payload: ipv4Payload3Addr1ToAddr2[:63], + }, + { + srcAddr: addr1, + dstAddr: addr2, + id: 1, + flags: 0, + fragmentOffset: 63, + payload: ipv4Payload3Addr1ToAddr2[63:], + }, + }, + expectedPayloads: nil, }, { name: "Second fragment has MoreFlags set", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, id: 1, flags: header.IPv4FlagMoreFragments, fragmentOffset: 0, - payload: ipv4Payload1[:64], + payload: ipv4Payload1Addr1ToAddr2[:64], }, { + srcAddr: addr1, + dstAddr: addr2, id: 1, flags: header.IPv4FlagMoreFragments, fragmentOffset: 64, - payload: ipv4Payload1[64:], + payload: ipv4Payload1Addr1ToAddr2[64:], }, }, expectedPayloads: nil, @@ -608,16 +693,20 @@ func TestReceiveFragments(t *testing.T) { name: "Two fragments with different IDs", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, id: 1, flags: header.IPv4FlagMoreFragments, fragmentOffset: 0, - payload: ipv4Payload1[:64], + payload: ipv4Payload1Addr1ToAddr2[:64], }, { + srcAddr: addr1, + dstAddr: addr2, id: 2, flags: 0, fragmentOffset: 64, - payload: ipv4Payload1[64:], + payload: ipv4Payload1Addr1ToAddr2[64:], }, }, expectedPayloads: nil, @@ -626,40 +715,88 @@ func TestReceiveFragments(t *testing.T) { name: "Two interleaved fragmented packets", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, id: 1, flags: header.IPv4FlagMoreFragments, fragmentOffset: 0, - payload: ipv4Payload1[:64], + payload: ipv4Payload1Addr1ToAddr2[:64], }, { + srcAddr: addr1, + dstAddr: addr2, id: 2, flags: header.IPv4FlagMoreFragments, fragmentOffset: 0, - payload: ipv4Payload2[:64], + payload: ipv4Payload2Addr1ToAddr2[:64], }, { + srcAddr: addr1, + dstAddr: addr2, id: 1, flags: 0, fragmentOffset: 64, - payload: ipv4Payload1[64:], + payload: ipv4Payload1Addr1ToAddr2[64:], }, { + srcAddr: addr1, + dstAddr: addr2, id: 2, flags: 0, fragmentOffset: 64, - payload: ipv4Payload2[64:], + payload: ipv4Payload2Addr1ToAddr2[64:], + }, + }, + expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2, udpPayload2Addr1ToAddr2}, + }, + { + name: "Two interleaved fragmented packets from different sources but with same ID", + fragments: []fragmentData{ + { + srcAddr: addr1, + dstAddr: addr2, + id: 1, + flags: header.IPv4FlagMoreFragments, + fragmentOffset: 0, + payload: ipv4Payload1Addr1ToAddr2[:64], + }, + { + srcAddr: addr3, + dstAddr: addr2, + id: 1, + flags: header.IPv4FlagMoreFragments, + fragmentOffset: 0, + payload: ipv4Payload1Addr3ToAddr2[:32], + }, + { + srcAddr: addr1, + dstAddr: addr2, + id: 1, + flags: 0, + fragmentOffset: 64, + payload: ipv4Payload1Addr1ToAddr2[64:], + }, + { + srcAddr: addr3, + dstAddr: addr2, + id: 1, + flags: 0, + fragmentOffset: 32, + payload: ipv4Payload1Addr3ToAddr2[32:], }, }, - expectedPayloads: [][]byte{udpPayload1, udpPayload2}, + expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2, udpPayload1Addr3ToAddr2}, }, { name: "Fragment without followup", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, id: 1, flags: header.IPv4FlagMoreFragments, fragmentOffset: 0, - payload: ipv4Payload1[:64], + payload: ipv4Payload1Addr1ToAddr2[:64], }, }, expectedPayloads: nil, @@ -711,8 +848,8 @@ func TestReceiveFragments(t *testing.T) { FragmentOffset: frag.fragmentOffset, TTL: 64, Protocol: uint8(header.UDPProtocolNumber), - SrcAddr: addr1, - DstAddr: addr2, + SrcAddr: frag.srcAddr, + DstAddr: frag.dstAddr, }) vv := hdr.View().ToVectorisedView() diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD index feada63dc..bcc64994e 100644 --- a/pkg/tcpip/network/ipv6/BUILD +++ b/pkg/tcpip/network/ipv6/BUILD @@ -14,7 +14,6 @@ go_library( "//pkg/tcpip/buffer", "//pkg/tcpip/header", "//pkg/tcpip/network/fragmentation", - "//pkg/tcpip/network/hash", "//pkg/tcpip/stack", ], ) diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go index 95fbcf2d1..4a0b53c45 100644 --- a/pkg/tcpip/network/ipv6/ipv6.go +++ b/pkg/tcpip/network/ipv6/ipv6.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/network/fragmentation" - "gvisor.dev/gvisor/pkg/tcpip/network/hash" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -52,7 +51,6 @@ type endpoint struct { linkEP stack.LinkEndpoint linkAddrCache stack.LinkAddressCache dispatcher stack.TransportDispatcher - fragmentation *fragmentation.Fragmentation protocol *protocol } @@ -343,7 +341,19 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) { var ready bool // Note that pkt doesn't have its transport header set after reassembly, // and won't until DeliverNetworkPacket sets it. - pkt.Data, ready, err = e.fragmentation.Process(hash.IPv6FragmentHash(h, extHdr.ID()), start, last, extHdr.More(), rawPayload.Buf) + pkt.Data, ready, err = e.protocol.fragmentation.Process( + // IPv6 ignores the Protocol field since the ID only needs to be unique + // across source-destination pairs, as per RFC 8200 section 4.5. + fragmentation.FragmentID{ + Source: h.SourceAddress(), + Destination: h.DestinationAddress(), + ID: extHdr.ID(), + }, + start, + last, + extHdr.More(), + rawPayload.Buf, + ) if err != nil { r.Stats().IP.MalformedPacketsReceived.Increment() r.Stats().IP.MalformedFragmentsReceived.Increment() @@ -434,7 +444,8 @@ type protocol struct { // defaultTTL is the current default TTL for the protocol. Only the // uint8 portion of it is meaningful and it must be accessed // atomically. - defaultTTL uint32 + defaultTTL uint32 + fragmentation *fragmentation.Fragmentation } // Number returns the ipv6 protocol number. @@ -467,7 +478,6 @@ func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWi linkEP: linkEP, linkAddrCache: linkAddrCache, dispatcher: dispatcher, - fragmentation: fragmentation.NewFragmentation(fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, fragmentation.DefaultReassembleTimeout), protocol: p, }, nil } @@ -595,5 +605,8 @@ func calculateMTU(mtu uint32) uint32 { // NewProtocol returns an IPv6 network protocol. func NewProtocol() stack.NetworkProtocol { - return &protocol{defaultTTL: DefaultTTL} + return &protocol{ + defaultTTL: DefaultTTL, + fragmentation: fragmentation.NewFragmentation(header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, fragmentation.DefaultReassembleTimeout), + } } diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go index 213ff64f2..3d65814de 100644 --- a/pkg/tcpip/network/ipv6/ipv6_test.go +++ b/pkg/tcpip/network/ipv6/ipv6_test.go @@ -673,20 +673,27 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) { // fragmentData holds the IPv6 payload for a fragmented IPv6 packet. type fragmentData struct { + srcAddr tcpip.Address + dstAddr tcpip.Address nextHdr uint8 data buffer.VectorisedView } func TestReceiveIPv6Fragments(t *testing.T) { - const nicID = 1 - const udpPayload1Length = 256 - const udpPayload2Length = 128 - const fragmentExtHdrLen = 8 - // Note, not all routing extension headers will be 8 bytes but this test - // uses 8 byte routing extension headers for most sub tests. - const routingExtHdrLen = 8 - - udpGen := func(payload []byte, multiplier uint8) buffer.View { + const ( + nicID = 1 + udpPayload1Length = 256 + udpPayload2Length = 128 + // Used to test cases where the fragment blocks are not a multiple of + // the fragment block size of 8 (RFC 8200 section 4.5). + udpPayload3Length = 127 + fragmentExtHdrLen = 8 + // Note, not all routing extension headers will be 8 bytes but this test + // uses 8 byte routing extension headers for most sub tests. + routingExtHdrLen = 8 + ) + + udpGen := func(payload []byte, multiplier uint8, src, dst tcpip.Address) buffer.View { payloadLen := len(payload) for i := 0; i < payloadLen; i++ { payload[i] = uint8(i) * multiplier @@ -702,19 +709,27 @@ func TestReceiveIPv6Fragments(t *testing.T) { Length: uint16(udpLength), }) copy(u.Payload(), payload) - sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, addr1, addr2, uint16(udpLength)) + sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, src, dst, uint16(udpLength)) sum = header.Checksum(payload, sum) u.SetChecksum(^u.CalculateChecksum(sum)) return hdr.View() } - var udpPayload1Buf [udpPayload1Length]byte - udpPayload1 := udpPayload1Buf[:] - ipv6Payload1 := udpGen(udpPayload1, 1) + var udpPayload1Addr1ToAddr2Buf [udpPayload1Length]byte + udpPayload1Addr1ToAddr2 := udpPayload1Addr1ToAddr2Buf[:] + ipv6Payload1Addr1ToAddr2 := udpGen(udpPayload1Addr1ToAddr2, 1, addr1, addr2) + + var udpPayload1Addr3ToAddr2Buf [udpPayload1Length]byte + udpPayload1Addr3ToAddr2 := udpPayload1Addr3ToAddr2Buf[:] + ipv6Payload1Addr3ToAddr2 := udpGen(udpPayload1Addr3ToAddr2, 4, addr3, addr2) - var udpPayload2Buf [udpPayload2Length]byte - udpPayload2 := udpPayload2Buf[:] - ipv6Payload2 := udpGen(udpPayload2, 2) + var udpPayload2Addr1ToAddr2Buf [udpPayload2Length]byte + udpPayload2Addr1ToAddr2 := udpPayload2Addr1ToAddr2Buf[:] + ipv6Payload2Addr1ToAddr2 := udpGen(udpPayload2Addr1ToAddr2, 2, addr1, addr2) + + var udpPayload3Addr1ToAddr2Buf [udpPayload3Length]byte + udpPayload3Addr1ToAddr2 := udpPayload3Addr1ToAddr2Buf[:] + ipv6Payload3Addr1ToAddr2 := udpGen(udpPayload3Addr1ToAddr2, 3, addr1, addr2) tests := []struct { name string @@ -726,34 +741,98 @@ func TestReceiveIPv6Fragments(t *testing.T) { name: "No fragmentation", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, nextHdr: uint8(header.UDPProtocolNumber), - data: ipv6Payload1.ToVectorisedView(), + data: ipv6Payload1Addr1ToAddr2.ToVectorisedView(), }, }, - expectedPayloads: [][]byte{udpPayload1}, + expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2}, }, { name: "Atomic fragment", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2), + []buffer.View{ + // Fragment extension header. + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 0, 0, 0, 0, 0}), + + ipv6Payload1Addr1ToAddr2, + }, + ), + }, + }, + expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2}, + }, + { + name: "Atomic fragment with size not a multiple of fragment block size", + fragments: []fragmentData{ + { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( - fragmentExtHdrLen+len(ipv6Payload1), + fragmentExtHdrLen+len(ipv6Payload3Addr1ToAddr2), []buffer.View{ // Fragment extension header. buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 0, 0, 0, 0, 0}), - ipv6Payload1, + ipv6Payload3Addr1ToAddr2, }, ), }, }, - expectedPayloads: [][]byte{udpPayload1}, + expectedPayloads: [][]byte{udpPayload3Addr1ToAddr2}, }, { name: "Two fragments", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+64, + []buffer.View{ + // Fragment extension header. + // + // Fragment offset = 0, More = true, ID = 1 + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), + + ipv6Payload1Addr1ToAddr2[:64], + }, + ), + }, + { + srcAddr: addr1, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-64, + []buffer.View{ + // Fragment extension header. + // + // Fragment offset = 8, More = false, ID = 1 + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}), + + ipv6Payload1Addr1ToAddr2[64:], + }, + ), + }, + }, + expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2}, + }, + { + name: "Two fragments with last fragment size not a multiple of fragment block size", + fragments: []fragmentData{ + { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( fragmentExtHdrLen+64, @@ -763,31 +842,73 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Fragment offset = 0, More = true, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), - ipv6Payload1[:64], + ipv6Payload3Addr1ToAddr2[:64], }, ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( - fragmentExtHdrLen+len(ipv6Payload1)-64, + fragmentExtHdrLen+len(ipv6Payload3Addr1ToAddr2)-64, []buffer.View{ // Fragment extension header. // // Fragment offset = 8, More = false, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}), - ipv6Payload1[64:], + ipv6Payload3Addr1ToAddr2[64:], }, ), }, }, - expectedPayloads: [][]byte{udpPayload1}, + expectedPayloads: [][]byte{udpPayload3Addr1ToAddr2}, + }, + { + name: "Two fragments with first fragment size not a multiple of fragment block size", + fragments: []fragmentData{ + { + srcAddr: addr1, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+63, + []buffer.View{ + // Fragment extension header. + // + // Fragment offset = 0, More = true, ID = 1 + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), + + ipv6Payload3Addr1ToAddr2[:63], + }, + ), + }, + { + srcAddr: addr1, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+len(ipv6Payload3Addr1ToAddr2)-63, + []buffer.View{ + // Fragment extension header. + // + // Fragment offset = 8, More = false, ID = 1 + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}), + + ipv6Payload3Addr1ToAddr2[63:], + }, + ), + }, + }, + expectedPayloads: nil, }, { name: "Two fragments with different IDs", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( fragmentExtHdrLen+64, @@ -797,21 +918,23 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Fragment offset = 0, More = true, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), - ipv6Payload1[:64], + ipv6Payload1Addr1ToAddr2[:64], }, ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( - fragmentExtHdrLen+len(ipv6Payload1)-64, + fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-64, []buffer.View{ // Fragment extension header. // // Fragment offset = 8, More = false, ID = 2 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 2}), - ipv6Payload1[64:], + ipv6Payload1Addr1ToAddr2[64:], }, ), }, @@ -822,6 +945,8 @@ func TestReceiveIPv6Fragments(t *testing.T) { name: "Two fragments with per-fragment routing header with zero segments left", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, nextHdr: routingExtHdrID, data: buffer.NewVectorisedView( routingExtHdrLen+fragmentExtHdrLen+64, @@ -836,14 +961,16 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Fragment offset = 0, More = true, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), - ipv6Payload1[:64], + ipv6Payload1Addr1ToAddr2[:64], }, ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: routingExtHdrID, data: buffer.NewVectorisedView( - routingExtHdrLen+fragmentExtHdrLen+len(ipv6Payload1)-64, + routingExtHdrLen+fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-64, []buffer.View{ // Routing extension header. // @@ -855,17 +982,19 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Fragment offset = 8, More = false, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}), - ipv6Payload1[64:], + ipv6Payload1Addr1ToAddr2[64:], }, ), }, }, - expectedPayloads: [][]byte{udpPayload1}, + expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2}, }, { name: "Two fragments with per-fragment routing header with non-zero segments left", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, nextHdr: routingExtHdrID, data: buffer.NewVectorisedView( routingExtHdrLen+fragmentExtHdrLen+64, @@ -880,14 +1009,16 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Fragment offset = 0, More = true, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), - ipv6Payload1[:64], + ipv6Payload1Addr1ToAddr2[:64], }, ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: routingExtHdrID, data: buffer.NewVectorisedView( - routingExtHdrLen+fragmentExtHdrLen+len(ipv6Payload1)-64, + routingExtHdrLen+fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-64, []buffer.View{ // Routing extension header. // @@ -899,7 +1030,7 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Fragment offset = 9, More = false, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 72, 0, 0, 0, 1}), - ipv6Payload1[64:], + ipv6Payload1Addr1ToAddr2[64:], }, ), }, @@ -910,6 +1041,8 @@ func TestReceiveIPv6Fragments(t *testing.T) { name: "Two fragments with routing header with zero segments left", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( routingExtHdrLen+fragmentExtHdrLen+64, @@ -924,31 +1057,35 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Segments left = 0. buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 1, 0, 2, 3, 4, 5}), - ipv6Payload1[:64], + ipv6Payload1Addr1ToAddr2[:64], }, ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( - fragmentExtHdrLen+len(ipv6Payload1)-64, + fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-64, []buffer.View{ // Fragment extension header. // // Fragment offset = 9, More = false, ID = 1 buffer.View([]byte{routingExtHdrID, 0, 0, 72, 0, 0, 0, 1}), - ipv6Payload1[64:], + ipv6Payload1Addr1ToAddr2[64:], }, ), }, }, - expectedPayloads: [][]byte{udpPayload1}, + expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2}, }, { name: "Two fragments with routing header with non-zero segments left", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( routingExtHdrLen+fragmentExtHdrLen+64, @@ -963,21 +1100,23 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Segments left = 1. buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 1, 1, 2, 3, 4, 5}), - ipv6Payload1[:64], + ipv6Payload1Addr1ToAddr2[:64], }, ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( - fragmentExtHdrLen+len(ipv6Payload1)-64, + fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-64, []buffer.View{ // Fragment extension header. // // Fragment offset = 9, More = false, ID = 1 buffer.View([]byte{routingExtHdrID, 0, 0, 72, 0, 0, 0, 1}), - ipv6Payload1[64:], + ipv6Payload1Addr1ToAddr2[64:], }, ), }, @@ -988,6 +1127,8 @@ func TestReceiveIPv6Fragments(t *testing.T) { name: "Two fragments with routing header with zero segments left across fragments", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( // The length of this payload is fragmentExtHdrLen+8 because the @@ -1008,12 +1149,14 @@ func TestReceiveIPv6Fragments(t *testing.T) { ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( // The length of this payload is - // fragmentExtHdrLen+8+len(ipv6Payload1) because the last 8 bytes of + // fragmentExtHdrLen+8+len(ipv6Payload1Addr1ToAddr2) because the last 8 bytes of // the 16 byte routing extension header is in this fagment. - fragmentExtHdrLen+8+len(ipv6Payload1), + fragmentExtHdrLen+8+len(ipv6Payload1Addr1ToAddr2), []buffer.View{ // Fragment extension header. // @@ -1023,7 +1166,7 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Routing extension header (part 2) buffer.View([]byte{6, 7, 8, 9, 10, 11, 12, 13}), - ipv6Payload1, + ipv6Payload1Addr1ToAddr2, }, ), }, @@ -1034,6 +1177,8 @@ func TestReceiveIPv6Fragments(t *testing.T) { name: "Two fragments with routing header with non-zero segments left across fragments", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( // The length of this payload is fragmentExtHdrLen+8 because the @@ -1054,12 +1199,14 @@ func TestReceiveIPv6Fragments(t *testing.T) { ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( // The length of this payload is - // fragmentExtHdrLen+8+len(ipv6Payload1) because the last 8 bytes of + // fragmentExtHdrLen+8+len(ipv6Payload1Addr1ToAddr2) because the last 8 bytes of // the 16 byte routing extension header is in this fagment. - fragmentExtHdrLen+8+len(ipv6Payload1), + fragmentExtHdrLen+8+len(ipv6Payload1Addr1ToAddr2), []buffer.View{ // Fragment extension header. // @@ -1069,7 +1216,7 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Routing extension header (part 2) buffer.View([]byte{6, 7, 8, 9, 10, 11, 12, 13}), - ipv6Payload1, + ipv6Payload1Addr1ToAddr2, }, ), }, @@ -1082,6 +1229,8 @@ func TestReceiveIPv6Fragments(t *testing.T) { name: "Two fragments with atomic", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( fragmentExtHdrLen+64, @@ -1091,47 +1240,53 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Fragment offset = 0, More = true, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), - ipv6Payload1[:64], + ipv6Payload1Addr1ToAddr2[:64], }, ), }, // This fragment has the same ID as the other fragments but is an atomic // fragment. It should not interfere with the other fragments. { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( - fragmentExtHdrLen+len(ipv6Payload2), + fragmentExtHdrLen+len(ipv6Payload2Addr1ToAddr2), []buffer.View{ // Fragment extension header. // // Fragment offset = 0, More = false, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 0, 0, 0, 0, 1}), - ipv6Payload2, + ipv6Payload2Addr1ToAddr2, }, ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( - fragmentExtHdrLen+len(ipv6Payload1)-64, + fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-64, []buffer.View{ // Fragment extension header. // // Fragment offset = 8, More = false, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}), - ipv6Payload1[64:], + ipv6Payload1Addr1ToAddr2[64:], }, ), }, }, - expectedPayloads: [][]byte{udpPayload2, udpPayload1}, + expectedPayloads: [][]byte{udpPayload2Addr1ToAddr2, udpPayload1Addr1ToAddr2}, }, { name: "Two interleaved fragmented packets", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( fragmentExtHdrLen+64, @@ -1141,11 +1296,13 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Fragment offset = 0, More = true, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), - ipv6Payload1[:64], + ipv6Payload1Addr1ToAddr2[:64], }, ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( fragmentExtHdrLen+32, @@ -1155,40 +1312,114 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Fragment offset = 0, More = true, ID = 2 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 2}), - ipv6Payload2[:32], + ipv6Payload2Addr1ToAddr2[:32], }, ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( - fragmentExtHdrLen+len(ipv6Payload1)-64, + fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-64, []buffer.View{ // Fragment extension header. // // Fragment offset = 8, More = false, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}), - ipv6Payload1[64:], + ipv6Payload1Addr1ToAddr2[64:], }, ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( - fragmentExtHdrLen+len(ipv6Payload2)-32, + fragmentExtHdrLen+len(ipv6Payload2Addr1ToAddr2)-32, []buffer.View{ // Fragment extension header. // // Fragment offset = 4, More = false, ID = 2 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 32, 0, 0, 0, 2}), - ipv6Payload2[32:], + ipv6Payload2Addr1ToAddr2[32:], + }, + ), + }, + }, + expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2, udpPayload2Addr1ToAddr2}, + }, + { + name: "Two interleaved fragmented packets from different sources but with same ID", + fragments: []fragmentData{ + { + srcAddr: addr1, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+64, + []buffer.View{ + // Fragment extension header. + // + // Fragment offset = 0, More = true, ID = 1 + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), + + ipv6Payload1Addr1ToAddr2[:64], + }, + ), + }, + { + srcAddr: addr3, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+32, + []buffer.View{ + // Fragment extension header. + // + // Fragment offset = 0, More = true, ID = 1 + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), + + ipv6Payload1Addr3ToAddr2[:32], + }, + ), + }, + { + srcAddr: addr1, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-64, + []buffer.View{ + // Fragment extension header. + // + // Fragment offset = 8, More = false, ID = 1 + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}), + + ipv6Payload1Addr1ToAddr2[64:], + }, + ), + }, + { + srcAddr: addr3, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-32, + []buffer.View{ + // Fragment extension header. + // + // Fragment offset = 4, More = false, ID = 1 + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 32, 0, 0, 0, 1}), + + ipv6Payload1Addr3ToAddr2[32:], }, ), }, }, - expectedPayloads: [][]byte{udpPayload1, udpPayload2}, + expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2, udpPayload1Addr3ToAddr2}, }, } @@ -1231,8 +1462,8 @@ func TestReceiveIPv6Fragments(t *testing.T) { PayloadLength: uint16(f.data.Size()), NextHeader: f.nextHdr, HopLimit: 255, - SrcAddr: addr1, - DstAddr: addr2, + SrcAddr: f.srcAddr, + DstAddr: f.dstAddr, }) vv := hdr.View().ToVectorisedView() diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD index 6b9a6b316..1c58bed2d 100644 --- a/pkg/tcpip/stack/BUILD +++ b/pkg/tcpip/stack/BUILD @@ -16,6 +16,18 @@ go_template_instance( ) go_template_instance( + name = "neighbor_entry_list", + out = "neighbor_entry_list.go", + package = "stack", + prefix = "neighborEntry", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*neighborEntry", + "Linker": "*neighborEntry", + }, +) + +go_template_instance( name = "packet_buffer_list", out = "packet_buffer_list.go", package = "stack", @@ -53,7 +65,12 @@ go_library( "linkaddrcache.go", "linkaddrentry_list.go", "ndp.go", + "neighbor_cache.go", + "neighbor_entry.go", + "neighbor_entry_list.go", + "neighborstate_string.go", "nic.go", + "nud.go", "packet_buffer.go", "packet_buffer_list.go", "rand.go", @@ -89,6 +106,7 @@ go_test( size = "medium", srcs = [ "ndp_test.go", + "nud_test.go", "stack_test.go", "transport_demuxer_test.go", "transport_test.go", @@ -110,6 +128,7 @@ go_test( "//pkg/tcpip/transport/udp", "//pkg/waiter", "@com_github_google_go_cmp//cmp:go_default_library", + "@com_github_google_go_cmp//cmp/cmpopts:go_default_library", ], ) @@ -117,8 +136,11 @@ go_test( name = "stack_test", size = "small", srcs = [ + "fake_time_test.go", "forwarder_test.go", "linkaddrcache_test.go", + "neighbor_cache_test.go", + "neighbor_entry_test.go", "nic_test.go", ], library = ":stack", @@ -128,5 +150,8 @@ go_test( "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", + "@com_github_dpjacques_clockwork//:go_default_library", + "@com_github_google_go_cmp//cmp:go_default_library", + "@com_github_google_go_cmp//cmp/cmpopts:go_default_library", ], ) diff --git a/pkg/tcpip/stack/conntrack.go b/pkg/tcpip/stack/conntrack.go index 559a1c4dd..470c265aa 100644 --- a/pkg/tcpip/stack/conntrack.go +++ b/pkg/tcpip/stack/conntrack.go @@ -240,7 +240,10 @@ func (ct *ConnTrack) connFor(pkt *PacketBuffer) (*conn, direction) { if err != nil { return nil, dirOriginal } + return ct.connForTID(tid) +} +func (ct *ConnTrack) connForTID(tid tupleID) (*conn, direction) { bucket := ct.bucket(tid) now := time.Now() @@ -604,3 +607,26 @@ func (ct *ConnTrack) reapTupleLocked(tuple *tuple, bucket int, now time.Time) bo return true } + +func (ct *ConnTrack) originalDst(epID TransportEndpointID) (tcpip.Address, uint16, *tcpip.Error) { + // Lookup the connection. The reply's original destination + // describes the original address. + tid := tupleID{ + srcAddr: epID.LocalAddress, + srcPort: epID.LocalPort, + dstAddr: epID.RemoteAddress, + dstPort: epID.RemotePort, + transProto: header.TCPProtocolNumber, + netProto: header.IPv4ProtocolNumber, + } + conn, _ := ct.connForTID(tid) + if conn == nil { + // Not a tracked connection. + return "", 0, tcpip.ErrNotConnected + } else if conn.manip == manipNone { + // Unmanipulated connection. + return "", 0, tcpip.ErrInvalidOptionValue + } + + return conn.original.dstAddr, conn.original.dstPort, nil +} diff --git a/pkg/tcpip/stack/fake_time_test.go b/pkg/tcpip/stack/fake_time_test.go new file mode 100644 index 000000000..92c8cb534 --- /dev/null +++ b/pkg/tcpip/stack/fake_time_test.go @@ -0,0 +1,209 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stack + +import ( + "container/heap" + "sync" + "time" + + "github.com/dpjacques/clockwork" + "gvisor.dev/gvisor/pkg/tcpip" +) + +type fakeClock struct { + clock clockwork.FakeClock + + // mu protects the fields below. + mu sync.RWMutex + + // times is min-heap of times. A heap is used for quick retrieval of the next + // upcoming time of scheduled work. + times *timeHeap + + // waitGroups stores one WaitGroup for all work scheduled to execute at the + // same time via AfterFunc. This allows parallel execution of all functions + // passed to AfterFunc scheduled for the same time. + waitGroups map[time.Time]*sync.WaitGroup +} + +func newFakeClock() *fakeClock { + return &fakeClock{ + clock: clockwork.NewFakeClock(), + times: &timeHeap{}, + waitGroups: make(map[time.Time]*sync.WaitGroup), + } +} + +var _ tcpip.Clock = (*fakeClock)(nil) + +// NowNanoseconds implements tcpip.Clock.NowNanoseconds. +func (fc *fakeClock) NowNanoseconds() int64 { + return fc.clock.Now().UnixNano() +} + +// NowMonotonic implements tcpip.Clock.NowMonotonic. +func (fc *fakeClock) NowMonotonic() int64 { + return fc.NowNanoseconds() +} + +// AfterFunc implements tcpip.Clock.AfterFunc. +func (fc *fakeClock) AfterFunc(d time.Duration, f func()) tcpip.Timer { + until := fc.clock.Now().Add(d) + wg := fc.addWait(until) + return &fakeTimer{ + clock: fc, + until: until, + timer: fc.clock.AfterFunc(d, func() { + defer wg.Done() + f() + }), + } +} + +// addWait adds an additional wait to the WaitGroup for parallel execution of +// all work scheduled for t. Returns a reference to the WaitGroup modified. +func (fc *fakeClock) addWait(t time.Time) *sync.WaitGroup { + fc.mu.RLock() + wg, ok := fc.waitGroups[t] + fc.mu.RUnlock() + + if ok { + wg.Add(1) + return wg + } + + fc.mu.Lock() + heap.Push(fc.times, t) + fc.mu.Unlock() + + wg = &sync.WaitGroup{} + wg.Add(1) + + fc.mu.Lock() + fc.waitGroups[t] = wg + fc.mu.Unlock() + + return wg +} + +// removeWait removes a wait from the WaitGroup for parallel execution of all +// work scheduled for t. +func (fc *fakeClock) removeWait(t time.Time) { + fc.mu.RLock() + defer fc.mu.RUnlock() + + wg := fc.waitGroups[t] + wg.Done() +} + +// advance executes all work that have been scheduled to execute within d from +// the current fake time. Blocks until all work has completed execution. +func (fc *fakeClock) advance(d time.Duration) { + // Block until all the work is done + until := fc.clock.Now().Add(d) + for { + fc.mu.Lock() + if fc.times.Len() == 0 { + fc.mu.Unlock() + return + } + + t := heap.Pop(fc.times).(time.Time) + if t.After(until) { + // No work to do + heap.Push(fc.times, t) + fc.mu.Unlock() + return + } + fc.mu.Unlock() + + diff := t.Sub(fc.clock.Now()) + fc.clock.Advance(diff) + + fc.mu.RLock() + wg := fc.waitGroups[t] + fc.mu.RUnlock() + + wg.Wait() + + fc.mu.Lock() + delete(fc.waitGroups, t) + fc.mu.Unlock() + } +} + +type fakeTimer struct { + clock *fakeClock + timer clockwork.Timer + + mu sync.RWMutex + until time.Time +} + +var _ tcpip.Timer = (*fakeTimer)(nil) + +// Reset implements tcpip.Timer.Reset. +func (ft *fakeTimer) Reset(d time.Duration) { + if !ft.timer.Reset(d) { + return + } + + ft.mu.Lock() + defer ft.mu.Unlock() + + ft.clock.removeWait(ft.until) + ft.until = ft.clock.clock.Now().Add(d) + ft.clock.addWait(ft.until) +} + +// Stop implements tcpip.Timer.Stop. +func (ft *fakeTimer) Stop() bool { + if !ft.timer.Stop() { + return false + } + + ft.mu.RLock() + defer ft.mu.RUnlock() + + ft.clock.removeWait(ft.until) + return true +} + +type timeHeap []time.Time + +var _ heap.Interface = (*timeHeap)(nil) + +func (h timeHeap) Len() int { + return len(h) +} + +func (h timeHeap) Less(i, j int) bool { + return h[i].Before(h[j]) +} + +func (h timeHeap) Swap(i, j int) { + h[i], h[j] = h[j], h[i] +} + +func (h *timeHeap) Push(x interface{}) { + *h = append(*h, x.(time.Time)) +} + +func (h *timeHeap) Pop() interface{} { + last := (*h)[len(*h)-1] + *h = (*h)[:len(*h)-1] + return last +} diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go index cbbae4224..110ba073d 100644 --- a/pkg/tcpip/stack/iptables.go +++ b/pkg/tcpip/stack/iptables.go @@ -218,19 +218,16 @@ func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, addr // Many users never configure iptables. Spare them the cost of rule // traversal if rules have never been set. it.mu.RLock() + defer it.mu.RUnlock() if !it.modified { - it.mu.RUnlock() return true } - it.mu.RUnlock() // Packets are manipulated only if connection and matching // NAT rule exists. shouldTrack := it.connections.handlePacket(pkt, hook, gso, r) // Go through each table containing the hook. - it.mu.RLock() - defer it.mu.RUnlock() priorities := it.priorities[hook] for _, tableID := range priorities { // If handlePacket already NATed the packet, we don't need to @@ -418,3 +415,9 @@ func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx // All the matchers matched, so run the target. return rule.Target.Action(pkt, &it.connections, hook, gso, r, address) } + +// OriginalDst returns the original destination of redirected connections. It +// returns an error if the connection doesn't exist or isn't redirected. +func (it *IPTables) OriginalDst(epID TransportEndpointID) (tcpip.Address, uint16, *tcpip.Error) { + return it.connections.originalDst(epID) +} diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go index 9dce11a97..5174e639c 100644 --- a/pkg/tcpip/stack/ndp.go +++ b/pkg/tcpip/stack/ndp.go @@ -33,12 +33,6 @@ const ( // Default = 1 (from RFC 4862 section 5.1) defaultDupAddrDetectTransmits = 1 - // defaultRetransmitTimer is the default amount of time to wait between - // sending NDP Neighbor solicitation messages. - // - // Default = 1s (from RFC 4861 section 10). - defaultRetransmitTimer = time.Second - // defaultMaxRtrSolicitations is the default number of Router // Solicitation messages to send when a NIC becomes enabled. // @@ -79,16 +73,6 @@ const ( // Default = true. defaultAutoGenGlobalAddresses = true - // minimumRetransmitTimer is the minimum amount of time to wait between - // sending NDP Neighbor solicitation messages. Note, RFC 4861 does - // not impose a minimum Retransmit Timer, but we do here to make sure - // the messages are not sent all at once. We also come to this value - // because in the RetransmitTimer field of a Router Advertisement, a - // value of 0 means unspecified, so the smallest valid value is 1. - // Note, the unit of the RetransmitTimer field in the Router - // Advertisement is milliseconds. - minimumRetransmitTimer = time.Millisecond - // minimumRtrSolicitationInterval is the minimum amount of time to wait // between sending Router Solicitation messages. This limit is imposed // to make sure that Router Solicitation messages are not sent all at diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go index 644ba7c33..5d286ccbc 100644 --- a/pkg/tcpip/stack/ndp_test.go +++ b/pkg/tcpip/stack/ndp_test.go @@ -1689,13 +1689,7 @@ func containsV6Addr(list []tcpip.ProtocolAddress, item tcpip.AddressWithPrefix) AddressWithPrefix: item, } - for _, i := range list { - if i == protocolAddress { - return true - } - } - - return false + return containsAddr(list, protocolAddress) } // TestNoAutoGenAddr tests that SLAAC is not performed when configured not to. diff --git a/pkg/tcpip/stack/neighbor_cache.go b/pkg/tcpip/stack/neighbor_cache.go new file mode 100644 index 000000000..27e1feec0 --- /dev/null +++ b/pkg/tcpip/stack/neighbor_cache.go @@ -0,0 +1,333 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stack + +import ( + "fmt" + "time" + + "gvisor.dev/gvisor/pkg/sleep" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/tcpip" +) + +const neighborCacheSize = 512 // max entries per interface + +// neighborCache maps IP addresses to link addresses. It uses the Least +// Recently Used (LRU) eviction strategy to implement a bounded cache for +// dynmically acquired entries. It contains the state machine and configuration +// for running Neighbor Unreachability Detection (NUD). +// +// There are two types of entries in the neighbor cache: +// 1. Dynamic entries are discovered automatically by neighbor discovery +// protocols (e.g. ARP, NDP). These protocols will attempt to reconfirm +// reachability with the device once the entry's state becomes Stale. +// 2. Static entries are explicitly added by a user and have no expiration. +// Their state is always Static. The amount of static entries stored in the +// cache is unbounded. +// +// neighborCache implements NUDHandler. +type neighborCache struct { + nic *NIC + state *NUDState + + // mu protects the fields below. + mu sync.RWMutex + + cache map[tcpip.Address]*neighborEntry + dynamic struct { + lru neighborEntryList + + // count tracks the amount of dynamic entries in the cache. This is + // needed since static entries do not count towards the LRU cache + // eviction strategy. + count uint16 + } +} + +var _ NUDHandler = (*neighborCache)(nil) + +// getOrCreateEntry retrieves a cache entry associated with addr. The +// returned entry is always refreshed in the cache (it is reachable via the +// map, and its place is bumped in LRU). +// +// If a matching entry exists in the cache, it is returned. If no matching +// entry exists and the cache is full, an existing entry is evicted via LRU, +// reset to state incomplete, and returned. If no matching entry exists and the +// cache is not full, a new entry with state incomplete is allocated and +// returned. +func (n *neighborCache) getOrCreateEntry(remoteAddr, localAddr tcpip.Address, linkRes LinkAddressResolver) *neighborEntry { + n.mu.Lock() + defer n.mu.Unlock() + + if entry, ok := n.cache[remoteAddr]; ok { + entry.mu.RLock() + if entry.neigh.State != Static { + n.dynamic.lru.Remove(entry) + n.dynamic.lru.PushFront(entry) + } + entry.mu.RUnlock() + return entry + } + + // The entry that needs to be created must be dynamic since all static + // entries are directly added to the cache via addStaticEntry. + entry := newNeighborEntry(n.nic, remoteAddr, localAddr, n.state, linkRes) + if n.dynamic.count == neighborCacheSize { + e := n.dynamic.lru.Back() + e.mu.Lock() + + delete(n.cache, e.neigh.Addr) + n.dynamic.lru.Remove(e) + n.dynamic.count-- + + e.dispatchRemoveEventLocked() + e.setStateLocked(Unknown) + e.notifyWakersLocked() + e.mu.Unlock() + } + n.cache[remoteAddr] = entry + n.dynamic.lru.PushFront(entry) + n.dynamic.count++ + return entry +} + +// entry looks up the neighbor cache for translating address to link address +// (e.g. IP -> MAC). If the LinkEndpoint requests address resolution and there +// is a LinkAddressResolver registered with the network protocol, the cache +// attempts to resolve the address and returns ErrWouldBlock. If a Waker is +// provided, it will be notified when address resolution is complete (success +// or not). +// +// If address resolution is required, ErrNoLinkAddress and a notification +// channel is returned for the top level caller to block. Channel is closed +// once address resolution is complete (success or not). +func (n *neighborCache) entry(remoteAddr, localAddr tcpip.Address, linkRes LinkAddressResolver, w *sleep.Waker) (NeighborEntry, <-chan struct{}, *tcpip.Error) { + if linkAddr, ok := linkRes.ResolveStaticAddress(remoteAddr); ok { + e := NeighborEntry{ + Addr: remoteAddr, + LocalAddr: localAddr, + LinkAddr: linkAddr, + State: Static, + UpdatedAt: time.Now(), + } + return e, nil, nil + } + + entry := n.getOrCreateEntry(remoteAddr, localAddr, linkRes) + entry.mu.Lock() + defer entry.mu.Unlock() + + switch s := entry.neigh.State; s { + case Reachable, Static: + return entry.neigh, nil, nil + + case Unknown, Incomplete, Stale, Delay, Probe: + entry.addWakerLocked(w) + + if entry.done == nil { + // Address resolution needs to be initiated. + if linkRes == nil { + return entry.neigh, nil, tcpip.ErrNoLinkAddress + } + entry.done = make(chan struct{}) + } + + entry.handlePacketQueuedLocked() + return entry.neigh, entry.done, tcpip.ErrWouldBlock + + case Failed: + return entry.neigh, nil, tcpip.ErrNoLinkAddress + + default: + panic(fmt.Sprintf("Invalid cache entry state: %s", s)) + } +} + +// removeWaker removes a waker that has been added when link resolution for +// addr was requested. +func (n *neighborCache) removeWaker(addr tcpip.Address, waker *sleep.Waker) { + n.mu.Lock() + if entry, ok := n.cache[addr]; ok { + delete(entry.wakers, waker) + } + n.mu.Unlock() +} + +// entries returns all entries in the neighbor cache. +func (n *neighborCache) entries() []NeighborEntry { + entries := make([]NeighborEntry, 0, len(n.cache)) + n.mu.RLock() + for _, entry := range n.cache { + entry.mu.RLock() + entries = append(entries, entry.neigh) + entry.mu.RUnlock() + } + n.mu.RUnlock() + return entries +} + +// addStaticEntry adds a static entry to the neighbor cache, mapping an IP +// address to a link address. If a dynamic entry exists in the neighbor cache +// with the same address, it will be replaced with this static entry. If a +// static entry exists with the same address but different link address, it +// will be updated with the new link address. If a static entry exists with the +// same address and link address, nothing will happen. +func (n *neighborCache) addStaticEntry(addr tcpip.Address, linkAddr tcpip.LinkAddress) { + n.mu.Lock() + defer n.mu.Unlock() + + if entry, ok := n.cache[addr]; ok { + entry.mu.Lock() + if entry.neigh.State != Static { + // Dynamic entry found with the same address. + n.dynamic.lru.Remove(entry) + n.dynamic.count-- + } else if entry.neigh.LinkAddr == linkAddr { + // Static entry found with the same address and link address. + entry.mu.Unlock() + return + } else { + // Static entry found with the same address but different link address. + entry.neigh.LinkAddr = linkAddr + entry.dispatchChangeEventLocked(entry.neigh.State) + entry.mu.Unlock() + return + } + + // Notify that resolution has been interrupted, just in case the entry was + // in the Incomplete or Probe state. + entry.dispatchRemoveEventLocked() + entry.setStateLocked(Unknown) + entry.notifyWakersLocked() + entry.mu.Unlock() + } + + entry := newStaticNeighborEntry(n.nic, addr, linkAddr, n.state) + n.cache[addr] = entry +} + +// removeEntryLocked removes the specified entry from the neighbor cache. +func (n *neighborCache) removeEntryLocked(entry *neighborEntry) { + if entry.neigh.State != Static { + n.dynamic.lru.Remove(entry) + n.dynamic.count-- + } + if entry.neigh.State != Failed { + entry.dispatchRemoveEventLocked() + } + entry.setStateLocked(Unknown) + entry.notifyWakersLocked() + + delete(n.cache, entry.neigh.Addr) +} + +// removeEntry removes a dynamic or static entry by address from the neighbor +// cache. Returns true if the entry was found and deleted. +func (n *neighborCache) removeEntry(addr tcpip.Address) bool { + n.mu.Lock() + defer n.mu.Unlock() + + entry, ok := n.cache[addr] + if !ok { + return false + } + + entry.mu.Lock() + defer entry.mu.Unlock() + + n.removeEntryLocked(entry) + return true +} + +// clear removes all dynamic and static entries from the neighbor cache. +func (n *neighborCache) clear() { + n.mu.Lock() + defer n.mu.Unlock() + + for _, entry := range n.cache { + entry.mu.Lock() + entry.dispatchRemoveEventLocked() + entry.setStateLocked(Unknown) + entry.notifyWakersLocked() + entry.mu.Unlock() + } + + n.dynamic.lru = neighborEntryList{} + n.cache = make(map[tcpip.Address]*neighborEntry) + n.dynamic.count = 0 +} + +// config returns the NUD configuration. +func (n *neighborCache) config() NUDConfigurations { + return n.state.Config() +} + +// setConfig changes the NUD configuration. +// +// If config contains invalid NUD configuration values, it will be fixed to +// use default values for the erroneous values. +func (n *neighborCache) setConfig(config NUDConfigurations) { + config.resetInvalidFields() + n.state.SetConfig(config) +} + +// HandleProbe implements NUDHandler.HandleProbe by following the logic defined +// in RFC 4861 section 7.2.3. Validation of the probe is expected to be handled +// by the caller. +func (n *neighborCache) HandleProbe(remoteAddr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, remoteLinkAddr tcpip.LinkAddress, linkRes LinkAddressResolver) { + entry := n.getOrCreateEntry(remoteAddr, localAddr, linkRes) + entry.mu.Lock() + entry.handleProbeLocked(remoteLinkAddr) + entry.mu.Unlock() +} + +// HandleConfirmation implements NUDHandler.HandleConfirmation by following the +// logic defined in RFC 4861 section 7.2.5. +// +// TODO(gvisor.dev/issue/2277): To protect against ARP poisoning and other +// attacks against NDP functions, Secure Neighbor Discovery (SEND) Protocol +// should be deployed where preventing access to the broadcast segment might +// not be possible. SEND uses RSA key pairs to produce cryptographically +// generated addresses, as defined in RFC 3972, Cryptographically Generated +// Addresses (CGA). This ensures that the claimed source of an NDP message is +// the owner of the claimed address. +func (n *neighborCache) HandleConfirmation(addr tcpip.Address, linkAddr tcpip.LinkAddress, flags ReachabilityConfirmationFlags) { + n.mu.RLock() + entry, ok := n.cache[addr] + n.mu.RUnlock() + if ok { + entry.mu.Lock() + entry.handleConfirmationLocked(linkAddr, flags) + entry.mu.Unlock() + } + // The confirmation SHOULD be silently discarded if the recipient did not + // initiate any communication with the target. This is indicated if there is + // no matching entry for the remote address. +} + +// HandleUpperLevelConfirmation implements +// NUDHandler.HandleUpperLevelConfirmation by following the logic defined in +// RFC 4861 section 7.3.1. +func (n *neighborCache) HandleUpperLevelConfirmation(addr tcpip.Address) { + n.mu.RLock() + entry, ok := n.cache[addr] + n.mu.RUnlock() + if ok { + entry.mu.Lock() + entry.handleUpperLevelConfirmationLocked() + entry.mu.Unlock() + } +} diff --git a/pkg/tcpip/stack/neighbor_cache_test.go b/pkg/tcpip/stack/neighbor_cache_test.go new file mode 100644 index 000000000..b4fa69e3e --- /dev/null +++ b/pkg/tcpip/stack/neighbor_cache_test.go @@ -0,0 +1,1726 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stack + +import ( + "bytes" + "encoding/binary" + "fmt" + "math" + "math/rand" + "strings" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "gvisor.dev/gvisor/pkg/sleep" + "gvisor.dev/gvisor/pkg/tcpip" +) + +const ( + // entryStoreSize is the default number of entries that will be generated and + // added to the entry store. This number needs to be larger than the size of + // the neighbor cache to give ample opportunity for verifying behavior during + // cache overflows. Four times the size of the neighbor cache allows for + // three complete cache overflows. + entryStoreSize = 4 * neighborCacheSize + + // typicalLatency is the typical latency for an ARP or NDP packet to travel + // to a router and back. + typicalLatency = time.Millisecond + + // testEntryBroadcastAddr is a special address that indicates a packet should + // be sent to all nodes. + testEntryBroadcastAddr = tcpip.Address("broadcast") + + // testEntryLocalAddr is the source address of neighbor probes. + testEntryLocalAddr = tcpip.Address("local_addr") + + // testEntryBroadcastLinkAddr is a special link address sent back to + // multicast neighbor probes. + testEntryBroadcastLinkAddr = tcpip.LinkAddress("mac_broadcast") + + // infiniteDuration indicates that a task will not occur in our lifetime. + infiniteDuration = time.Duration(math.MaxInt64) +) + +// entryDiffOpts returns the options passed to cmp.Diff to compare neighbor +// entries. The UpdatedAt field is ignored due to a lack of a deterministic +// method to predict the time that an event will be dispatched. +func entryDiffOpts() []cmp.Option { + return []cmp.Option{ + cmpopts.IgnoreFields(NeighborEntry{}, "UpdatedAt"), + } +} + +// entryDiffOptsWithSort is like entryDiffOpts but also includes an option to +// sort slices of entries for cases where ordering must be ignored. +func entryDiffOptsWithSort() []cmp.Option { + return []cmp.Option{ + cmpopts.IgnoreFields(NeighborEntry{}, "UpdatedAt"), + cmpopts.SortSlices(func(a, b NeighborEntry) bool { + return strings.Compare(string(a.Addr), string(b.Addr)) < 0 + }), + } +} + +func newTestNeighborCache(nudDisp NUDDispatcher, config NUDConfigurations, clock tcpip.Clock) *neighborCache { + config.resetInvalidFields() + rng := rand.New(rand.NewSource(time.Now().UnixNano())) + return &neighborCache{ + nic: &NIC{ + stack: &Stack{ + clock: clock, + nudDisp: nudDisp, + }, + id: 1, + }, + state: NewNUDState(config, rng), + cache: make(map[tcpip.Address]*neighborEntry, neighborCacheSize), + } +} + +// testEntryStore contains a set of IP to NeighborEntry mappings. +type testEntryStore struct { + mu sync.RWMutex + entriesMap map[tcpip.Address]NeighborEntry +} + +func toAddress(i int) tcpip.Address { + buf := new(bytes.Buffer) + binary.Write(buf, binary.BigEndian, uint8(1)) + binary.Write(buf, binary.BigEndian, uint8(0)) + binary.Write(buf, binary.BigEndian, uint16(i)) + return tcpip.Address(buf.String()) +} + +func toLinkAddress(i int) tcpip.LinkAddress { + buf := new(bytes.Buffer) + binary.Write(buf, binary.BigEndian, uint8(1)) + binary.Write(buf, binary.BigEndian, uint8(0)) + binary.Write(buf, binary.BigEndian, uint32(i)) + return tcpip.LinkAddress(buf.String()) +} + +// newTestEntryStore returns a testEntryStore pre-populated with entries. +func newTestEntryStore() *testEntryStore { + store := &testEntryStore{ + entriesMap: make(map[tcpip.Address]NeighborEntry), + } + for i := 0; i < entryStoreSize; i++ { + addr := toAddress(i) + linkAddr := toLinkAddress(i) + + store.entriesMap[addr] = NeighborEntry{ + Addr: addr, + LocalAddr: testEntryLocalAddr, + LinkAddr: linkAddr, + } + } + return store +} + +// size returns the number of entries in the store. +func (s *testEntryStore) size() int { + s.mu.RLock() + defer s.mu.RUnlock() + return len(s.entriesMap) +} + +// entry returns the entry at index i. Returns an empty entry and false if i is +// out of bounds. +func (s *testEntryStore) entry(i int) (NeighborEntry, bool) { + return s.entryByAddr(toAddress(i)) +} + +// entryByAddr returns the entry matching addr for situations when the index is +// not available. Returns an empty entry and false if no entries match addr. +func (s *testEntryStore) entryByAddr(addr tcpip.Address) (NeighborEntry, bool) { + s.mu.RLock() + defer s.mu.RUnlock() + entry, ok := s.entriesMap[addr] + return entry, ok +} + +// entries returns all entries in the store. +func (s *testEntryStore) entries() []NeighborEntry { + entries := make([]NeighborEntry, 0, len(s.entriesMap)) + s.mu.RLock() + defer s.mu.RUnlock() + for i := 0; i < entryStoreSize; i++ { + addr := toAddress(i) + if entry, ok := s.entriesMap[addr]; ok { + entries = append(entries, entry) + } + } + return entries +} + +// set modifies the link addresses of an entry. +func (s *testEntryStore) set(i int, linkAddr tcpip.LinkAddress) { + addr := toAddress(i) + s.mu.Lock() + defer s.mu.Unlock() + if entry, ok := s.entriesMap[addr]; ok { + entry.LinkAddr = linkAddr + s.entriesMap[addr] = entry + } +} + +// testNeighborResolver implements LinkAddressResolver to emulate sending a +// neighbor probe. +type testNeighborResolver struct { + clock tcpip.Clock + neigh *neighborCache + entries *testEntryStore + delay time.Duration + onLinkAddressRequest func() +} + +var _ LinkAddressResolver = (*testNeighborResolver)(nil) + +func (r *testNeighborResolver) LinkAddressRequest(addr, localAddr tcpip.Address, linkAddr tcpip.LinkAddress, linkEP LinkEndpoint) *tcpip.Error { + // Delay handling the request to emulate network latency. + r.clock.AfterFunc(r.delay, func() { + r.fakeRequest(addr) + }) + + // Execute post address resolution action, if available. + if f := r.onLinkAddressRequest; f != nil { + f() + } + return nil +} + +// fakeRequest emulates handling a response for a link address request. +func (r *testNeighborResolver) fakeRequest(addr tcpip.Address) { + if entry, ok := r.entries.entryByAddr(addr); ok { + r.neigh.HandleConfirmation(addr, entry.LinkAddr, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + } +} + +func (*testNeighborResolver) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) { + if addr == testEntryBroadcastAddr { + return testEntryBroadcastLinkAddr, true + } + return "", false +} + +func (*testNeighborResolver) LinkAddressProtocol() tcpip.NetworkProtocolNumber { + return 0 +} + +type entryEvent struct { + nicID tcpip.NICID + address tcpip.Address + linkAddr tcpip.LinkAddress + state NeighborState +} + +func TestNeighborCacheGetConfig(t *testing.T) { + nudDisp := testNUDDispatcher{} + c := DefaultNUDConfigurations() + clock := newFakeClock() + neigh := newTestNeighborCache(&nudDisp, c, clock) + + if got, want := neigh.config(), c; got != want { + t.Errorf("got neigh.config() = %+v, want = %+v", got, want) + } + + // No events should have been dispatched. + nudDisp.mu.Lock() + defer nudDisp.mu.Unlock() + if diff := cmp.Diff(nudDisp.events, []testEntryEventInfo(nil)); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } +} + +func TestNeighborCacheSetConfig(t *testing.T) { + nudDisp := testNUDDispatcher{} + c := DefaultNUDConfigurations() + clock := newFakeClock() + neigh := newTestNeighborCache(&nudDisp, c, clock) + + c.MinRandomFactor = 1 + c.MaxRandomFactor = 1 + neigh.setConfig(c) + + if got, want := neigh.config(), c; got != want { + t.Errorf("got neigh.config() = %+v, want = %+v", got, want) + } + + // No events should have been dispatched. + nudDisp.mu.Lock() + defer nudDisp.mu.Unlock() + if diff := cmp.Diff(nudDisp.events, []testEntryEventInfo(nil)); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } +} + +func TestNeighborCacheEntry(t *testing.T) { + c := DefaultNUDConfigurations() + nudDisp := testNUDDispatcher{} + clock := newFakeClock() + neigh := newTestNeighborCache(&nudDisp, c, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: typicalLatency, + } + + entry, ok := store.entry(0) + if !ok { + t.Fatalf("store.entry(0) not found") + } + _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != tcpip.ErrWouldBlock { + t.Errorf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + + clock.advance(typicalLatency) + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + nudDisp.mu.Lock() + diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...) + nudDisp.events = nil + nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + + if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != nil { + t.Fatalf("unexpected error from neigh.entry(%s, %s, _, nil): %s", entry.Addr, entry.LocalAddr, err) + } + + // No more events should have been dispatched. + nudDisp.mu.Lock() + defer nudDisp.mu.Unlock() + if diff := cmp.Diff(nudDisp.events, []testEntryEventInfo(nil)); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } +} + +func TestNeighborCacheRemoveEntry(t *testing.T) { + config := DefaultNUDConfigurations() + + nudDisp := testNUDDispatcher{} + clock := newFakeClock() + neigh := newTestNeighborCache(&nudDisp, config, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: typicalLatency, + } + + entry, ok := store.entry(0) + if !ok { + t.Fatalf("store.entry(0) not found") + } + _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != tcpip.ErrWouldBlock { + t.Errorf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + + clock.advance(typicalLatency) + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + nudDisp.mu.Lock() + diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...) + nudDisp.events = nil + nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + + neigh.removeEntry(entry.Addr) + + { + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestRemoved, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + nudDisp.mu.Lock() + diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...) + nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + } + + if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != tcpip.ErrWouldBlock { + t.Errorf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } +} + +type testContext struct { + clock *fakeClock + neigh *neighborCache + store *testEntryStore + linkRes *testNeighborResolver + nudDisp *testNUDDispatcher +} + +func newTestContext(c NUDConfigurations) testContext { + nudDisp := &testNUDDispatcher{} + clock := newFakeClock() + neigh := newTestNeighborCache(nudDisp, c, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: typicalLatency, + } + + return testContext{ + clock: clock, + neigh: neigh, + store: store, + linkRes: linkRes, + nudDisp: nudDisp, + } +} + +type overflowOptions struct { + startAtEntryIndex int + wantStaticEntries []NeighborEntry +} + +func (c *testContext) overflowCache(opts overflowOptions) error { + // Fill the neighbor cache to capacity to verify the LRU eviction strategy is + // working properly after the entry removal. + for i := opts.startAtEntryIndex; i < c.store.size(); i++ { + // Add a new entry + entry, ok := c.store.entry(i) + if !ok { + return fmt.Errorf("c.store.entry(%d) not found", i) + } + if _, _, err := c.neigh.entry(entry.Addr, entry.LocalAddr, c.linkRes, nil); err != tcpip.ErrWouldBlock { + return fmt.Errorf("got c.neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + c.clock.advance(c.neigh.config().RetransmitTimer) + + var wantEvents []testEntryEventInfo + + // When beyond the full capacity, the cache will evict an entry as per the + // LRU eviction strategy. Note that the number of static entries should not + // affect the total number of dynamic entries that can be added. + if i >= neighborCacheSize+opts.startAtEntryIndex { + removedEntry, ok := c.store.entry(i - neighborCacheSize) + if !ok { + return fmt.Errorf("store.entry(%d) not found", i-neighborCacheSize) + } + wantEvents = append(wantEvents, testEntryEventInfo{ + EventType: entryTestRemoved, + NICID: 1, + Addr: removedEntry.Addr, + LinkAddr: removedEntry.LinkAddr, + State: Reachable, + }) + } + + wantEvents = append(wantEvents, testEntryEventInfo{ + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + State: Incomplete, + }, testEntryEventInfo{ + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }) + + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) + c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + return fmt.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + } + + // Expect to find only the most recent entries. The order of entries reported + // by entries() is undeterministic, so entries have to be sorted before + // comparison. + wantUnsortedEntries := opts.wantStaticEntries + for i := c.store.size() - neighborCacheSize; i < c.store.size(); i++ { + entry, ok := c.store.entry(i) + if !ok { + return fmt.Errorf("c.store.entry(%d) not found", i) + } + wantEntry := NeighborEntry{ + Addr: entry.Addr, + LocalAddr: entry.LocalAddr, + LinkAddr: entry.LinkAddr, + State: Reachable, + } + wantUnsortedEntries = append(wantUnsortedEntries, wantEntry) + } + + if diff := cmp.Diff(c.neigh.entries(), wantUnsortedEntries, entryDiffOptsWithSort()...); diff != "" { + return fmt.Errorf("neighbor entries mismatch (-got, +want):\n%s", diff) + } + + // No more events should have been dispatched. + c.nudDisp.mu.Lock() + defer c.nudDisp.mu.Unlock() + if diff := cmp.Diff(c.nudDisp.events, []testEntryEventInfo(nil)); diff != "" { + return fmt.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + + return nil +} + +// TestNeighborCacheOverflow verifies that the LRU cache eviction strategy +// respects the dynamic entry count. +func TestNeighborCacheOverflow(t *testing.T) { + config := DefaultNUDConfigurations() + // Stay in Reachable so the cache can overflow + config.BaseReachableTime = infiniteDuration + config.MinRandomFactor = 1 + config.MaxRandomFactor = 1 + + c := newTestContext(config) + opts := overflowOptions{ + startAtEntryIndex: 0, + } + if err := c.overflowCache(opts); err != nil { + t.Errorf("c.overflowCache(%+v): %s", opts, err) + } +} + +// TestNeighborCacheRemoveEntryThenOverflow verifies that the LRU cache +// eviction strategy respects the dynamic entry count when an entry is removed. +func TestNeighborCacheRemoveEntryThenOverflow(t *testing.T) { + config := DefaultNUDConfigurations() + // Stay in Reachable so the cache can overflow + config.BaseReachableTime = infiniteDuration + config.MinRandomFactor = 1 + config.MaxRandomFactor = 1 + + c := newTestContext(config) + + // Add a dynamic entry + entry, ok := c.store.entry(0) + if !ok { + t.Fatalf("c.store.entry(0) not found") + } + _, _, err := c.neigh.entry(entry.Addr, entry.LocalAddr, c.linkRes, nil) + if err != tcpip.ErrWouldBlock { + t.Errorf("got c.neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + c.clock.advance(c.neigh.config().RetransmitTimer) + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) + c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + + // Remove the entry + c.neigh.removeEntry(entry.Addr) + + { + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestRemoved, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) + c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + } + + opts := overflowOptions{ + startAtEntryIndex: 0, + } + if err := c.overflowCache(opts); err != nil { + t.Errorf("c.overflowCache(%+v): %s", opts, err) + } +} + +// TestNeighborCacheDuplicateStaticEntryWithSameLinkAddress verifies that +// adding a duplicate static entry with the same link address does not dispatch +// any events. +func TestNeighborCacheDuplicateStaticEntryWithSameLinkAddress(t *testing.T) { + config := DefaultNUDConfigurations() + c := newTestContext(config) + + // Add a static entry + entry, ok := c.store.entry(0) + if !ok { + t.Fatalf("c.store.entry(0) not found") + } + staticLinkAddr := entry.LinkAddr + "static" + c.neigh.addStaticEntry(entry.Addr, staticLinkAddr) + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + LinkAddr: staticLinkAddr, + State: Static, + }, + } + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) + c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + + // Remove the static entry that was just added + c.neigh.addStaticEntry(entry.Addr, staticLinkAddr) + + // No more events should have been dispatched. + c.nudDisp.mu.Lock() + defer c.nudDisp.mu.Unlock() + if diff := cmp.Diff(c.nudDisp.events, []testEntryEventInfo(nil)); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } +} + +// TestNeighborCacheDuplicateStaticEntryWithDifferentLinkAddress verifies that +// adding a duplicate static entry with a different link address dispatches a +// change event. +func TestNeighborCacheDuplicateStaticEntryWithDifferentLinkAddress(t *testing.T) { + config := DefaultNUDConfigurations() + c := newTestContext(config) + + // Add a static entry + entry, ok := c.store.entry(0) + if !ok { + t.Fatalf("c.store.entry(0) not found") + } + staticLinkAddr := entry.LinkAddr + "static" + c.neigh.addStaticEntry(entry.Addr, staticLinkAddr) + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + LinkAddr: staticLinkAddr, + State: Static, + }, + } + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) + c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + + // Add a duplicate entry with a different link address + staticLinkAddr += "duplicate" + c.neigh.addStaticEntry(entry.Addr, staticLinkAddr) + { + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: staticLinkAddr, + State: Static, + }, + } + c.nudDisp.mu.Lock() + defer c.nudDisp.mu.Unlock() + if diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + } +} + +// TestNeighborCacheRemoveStaticEntryThenOverflow verifies that the LRU cache +// eviction strategy respects the dynamic entry count when a static entry is +// added then removed. In this case, the dynamic entry count shouldn't have +// been touched. +func TestNeighborCacheRemoveStaticEntryThenOverflow(t *testing.T) { + config := DefaultNUDConfigurations() + // Stay in Reachable so the cache can overflow + config.BaseReachableTime = infiniteDuration + config.MinRandomFactor = 1 + config.MaxRandomFactor = 1 + + c := newTestContext(config) + + // Add a static entry + entry, ok := c.store.entry(0) + if !ok { + t.Fatalf("c.store.entry(0) not found") + } + staticLinkAddr := entry.LinkAddr + "static" + c.neigh.addStaticEntry(entry.Addr, staticLinkAddr) + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + LinkAddr: staticLinkAddr, + State: Static, + }, + } + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) + c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + + // Remove the static entry that was just added + c.neigh.removeEntry(entry.Addr) + { + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestRemoved, + NICID: 1, + Addr: entry.Addr, + LinkAddr: staticLinkAddr, + State: Static, + }, + } + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) + c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + } + + opts := overflowOptions{ + startAtEntryIndex: 0, + } + if err := c.overflowCache(opts); err != nil { + t.Errorf("c.overflowCache(%+v): %s", opts, err) + } +} + +// TestNeighborCacheOverwriteWithStaticEntryThenOverflow verifies that the LRU +// cache eviction strategy keeps count of the dynamic entry count when an entry +// is overwritten by a static entry. Static entries should not count towards +// the size of the LRU cache. +func TestNeighborCacheOverwriteWithStaticEntryThenOverflow(t *testing.T) { + config := DefaultNUDConfigurations() + // Stay in Reachable so the cache can overflow + config.BaseReachableTime = infiniteDuration + config.MinRandomFactor = 1 + config.MaxRandomFactor = 1 + + c := newTestContext(config) + + // Add a dynamic entry + entry, ok := c.store.entry(0) + if !ok { + t.Fatalf("c.store.entry(0) not found") + } + _, _, err := c.neigh.entry(entry.Addr, entry.LocalAddr, c.linkRes, nil) + if err != tcpip.ErrWouldBlock { + t.Errorf("got c.neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + c.clock.advance(typicalLatency) + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) + c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + + // Override the entry with a static one using the same address + staticLinkAddr := entry.LinkAddr + "static" + c.neigh.addStaticEntry(entry.Addr, staticLinkAddr) + { + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestRemoved, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + LinkAddr: staticLinkAddr, + State: Static, + }, + } + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) + c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + } + + opts := overflowOptions{ + startAtEntryIndex: 1, + wantStaticEntries: []NeighborEntry{ + { + Addr: entry.Addr, + LocalAddr: "", // static entries don't need a local address + LinkAddr: staticLinkAddr, + State: Static, + }, + }, + } + if err := c.overflowCache(opts); err != nil { + t.Errorf("c.overflowCache(%+v): %s", opts, err) + } +} + +func TestNeighborCacheNotifiesWaker(t *testing.T) { + config := DefaultNUDConfigurations() + + nudDisp := testNUDDispatcher{} + clock := newFakeClock() + neigh := newTestNeighborCache(&nudDisp, config, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: typicalLatency, + } + + w := sleep.Waker{} + s := sleep.Sleeper{} + const wakerID = 1 + s.AddWaker(&w, wakerID) + + entry, ok := store.entry(0) + if !ok { + t.Fatalf("store.entry(0) not found") + } + _, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, &w) + if err != tcpip.ErrWouldBlock { + t.Fatalf("got neigh.entry(%s, %s, _, _ = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + if doneCh == nil { + t.Fatalf("expected done channel from neigh.entry(%s, %s, _, _)", entry.Addr, entry.LocalAddr) + } + clock.advance(typicalLatency) + + select { + case <-doneCh: + default: + t.Fatal("expected notification from done channel") + } + + id, ok := s.Fetch(false /* block */) + if !ok { + t.Errorf("expected waker to be notified after neigh.entry(%s, %s, _, _)", entry.Addr, entry.LocalAddr) + } + if id != wakerID { + t.Errorf("got s.Fetch(false) = %d, want = %d", id, wakerID) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + nudDisp.mu.Lock() + defer nudDisp.mu.Unlock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } +} + +func TestNeighborCacheRemoveWaker(t *testing.T) { + config := DefaultNUDConfigurations() + + nudDisp := testNUDDispatcher{} + clock := newFakeClock() + neigh := newTestNeighborCache(&nudDisp, config, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: typicalLatency, + } + + w := sleep.Waker{} + s := sleep.Sleeper{} + const wakerID = 1 + s.AddWaker(&w, wakerID) + + entry, ok := store.entry(0) + if !ok { + t.Fatalf("store.entry(0) not found") + } + _, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, &w) + if err != tcpip.ErrWouldBlock { + t.Fatalf("got neigh.entry(%s, %s, _, _) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + if doneCh == nil { + t.Fatalf("expected done channel from neigh.entry(%s, %s, _, _)", entry.Addr, entry.LocalAddr) + } + + // Remove the waker before the neighbor cache has the opportunity to send a + // notification. + neigh.removeWaker(entry.Addr, &w) + clock.advance(typicalLatency) + + select { + case <-doneCh: + default: + t.Fatal("expected notification from done channel") + } + + if id, ok := s.Fetch(false /* block */); ok { + t.Errorf("unexpected notification from waker with id %d", id) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + nudDisp.mu.Lock() + defer nudDisp.mu.Unlock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } +} + +func TestNeighborCacheAddStaticEntryThenOverflow(t *testing.T) { + config := DefaultNUDConfigurations() + // Stay in Reachable so the cache can overflow + config.BaseReachableTime = infiniteDuration + config.MinRandomFactor = 1 + config.MaxRandomFactor = 1 + + c := newTestContext(config) + + entry, ok := c.store.entry(0) + if !ok { + t.Fatalf("c.store.entry(0) not found") + } + c.neigh.addStaticEntry(entry.Addr, entry.LinkAddr) + e, _, err := c.neigh.entry(entry.Addr, "", c.linkRes, nil) + if err != nil { + t.Errorf("unexpected error from c.neigh.entry(%s, \"\", _, nil): %s", entry.Addr, err) + } + want := NeighborEntry{ + Addr: entry.Addr, + LocalAddr: "", // static entries don't need a local address + LinkAddr: entry.LinkAddr, + State: Static, + } + if diff := cmp.Diff(e, want, entryDiffOpts()...); diff != "" { + t.Errorf("c.neigh.entry(%s, \"\", _, nil) mismatch (-got, +want):\n%s", entry.Addr, diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Static, + }, + } + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) + c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + + opts := overflowOptions{ + startAtEntryIndex: 1, + wantStaticEntries: []NeighborEntry{ + { + Addr: entry.Addr, + LocalAddr: "", // static entries don't need a local address + LinkAddr: entry.LinkAddr, + State: Static, + }, + }, + } + if err := c.overflowCache(opts); err != nil { + t.Errorf("c.overflowCache(%+v): %s", opts, err) + } +} + +func TestNeighborCacheClear(t *testing.T) { + config := DefaultNUDConfigurations() + + nudDisp := testNUDDispatcher{} + clock := newFakeClock() + neigh := newTestNeighborCache(&nudDisp, config, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: typicalLatency, + } + + // Add a dynamic entry. + entry, ok := store.entry(0) + if !ok { + t.Fatalf("store.entry(0) not found") + } + _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != tcpip.ErrWouldBlock { + t.Errorf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + clock.advance(typicalLatency) + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + nudDisp.mu.Lock() + diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...) + nudDisp.events = nil + nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + + // Add a static entry. + neigh.addStaticEntry(entryTestAddr1, entryTestLinkAddr1) + + { + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Static, + }, + } + nudDisp.mu.Lock() + diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...) + nudDisp.events = nil + nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + } + + // Clear shoud remove both dynamic and static entries. + neigh.clear() + + // Remove events dispatched from clear() have no deterministic order so they + // need to be sorted beforehand. + wantUnsortedEvents := []testEntryEventInfo{ + { + EventType: entryTestRemoved, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + { + EventType: entryTestRemoved, + NICID: 1, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Static, + }, + } + nudDisp.mu.Lock() + defer nudDisp.mu.Unlock() + if diff := cmp.Diff(nudDisp.events, wantUnsortedEvents, eventDiffOptsWithSort()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } +} + +// TestNeighborCacheClearThenOverflow verifies that the LRU cache eviction +// strategy keeps count of the dynamic entry count when all entries are +// cleared. +func TestNeighborCacheClearThenOverflow(t *testing.T) { + config := DefaultNUDConfigurations() + // Stay in Reachable so the cache can overflow + config.BaseReachableTime = infiniteDuration + config.MinRandomFactor = 1 + config.MaxRandomFactor = 1 + + c := newTestContext(config) + + // Add a dynamic entry + entry, ok := c.store.entry(0) + if !ok { + t.Fatalf("c.store.entry(0) not found") + } + _, _, err := c.neigh.entry(entry.Addr, entry.LocalAddr, c.linkRes, nil) + if err != tcpip.ErrWouldBlock { + t.Errorf("got c.neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + c.clock.advance(typicalLatency) + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) + c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + + // Clear the cache. + c.neigh.clear() + { + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestRemoved, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) + c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + } + + opts := overflowOptions{ + startAtEntryIndex: 0, + } + if err := c.overflowCache(opts); err != nil { + t.Errorf("c.overflowCache(%+v): %s", opts, err) + } +} + +func TestNeighborCacheKeepFrequentlyUsed(t *testing.T) { + config := DefaultNUDConfigurations() + // Stay in Reachable so the cache can overflow + config.BaseReachableTime = infiniteDuration + config.MinRandomFactor = 1 + config.MaxRandomFactor = 1 + + nudDisp := testNUDDispatcher{} + clock := newFakeClock() + neigh := newTestNeighborCache(&nudDisp, config, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: typicalLatency, + } + + frequentlyUsedEntry, ok := store.entry(0) + if !ok { + t.Fatalf("store.entry(0) not found") + } + + // The following logic is very similar to overflowCache, but + // periodically refreshes the frequently used entry. + + // Fill the neighbor cache to capacity + for i := 0; i < neighborCacheSize; i++ { + entry, ok := store.entry(i) + if !ok { + t.Fatalf("store.entry(%d) not found", i) + } + _, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != tcpip.ErrWouldBlock { + t.Errorf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + clock.advance(typicalLatency) + select { + case <-doneCh: + default: + t.Fatalf("expected notification from done channel returned by neigh.entry(%s, %s, _, nil)", entry.Addr, entry.LocalAddr) + } + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + nudDisp.mu.Lock() + diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...) + nudDisp.events = nil + nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + } + + // Keep adding more entries + for i := neighborCacheSize; i < store.size(); i++ { + // Periodically refresh the frequently used entry + if i%(neighborCacheSize/2) == 0 { + _, _, err := neigh.entry(frequentlyUsedEntry.Addr, frequentlyUsedEntry.LocalAddr, linkRes, nil) + if err != nil { + t.Errorf("unexpected error from neigh.entry(%s, %s, _, nil): %s", frequentlyUsedEntry.Addr, frequentlyUsedEntry.LocalAddr, err) + } + } + + entry, ok := store.entry(i) + if !ok { + t.Fatalf("store.entry(%d) not found", i) + } + _, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != tcpip.ErrWouldBlock { + t.Errorf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + clock.advance(typicalLatency) + select { + case <-doneCh: + default: + t.Fatalf("expected notification from done channel returned by neigh.entry(%s, %s, _, nil)", entry.Addr, entry.LocalAddr) + } + + // An entry should have been removed, as per the LRU eviction strategy + removedEntry, ok := store.entry(i - neighborCacheSize + 1) + if !ok { + t.Fatalf("store.entry(%d) not found", i-neighborCacheSize+1) + } + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestRemoved, + NICID: 1, + Addr: removedEntry.Addr, + LinkAddr: removedEntry.LinkAddr, + State: Reachable, + }, + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + nudDisp.mu.Lock() + diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...) + nudDisp.events = nil + nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + } + + // Expect to find only the frequently used entry and the most recent entries. + // The order of entries reported by entries() is undeterministic, so entries + // have to be sorted before comparison. + wantUnsortedEntries := []NeighborEntry{ + { + Addr: frequentlyUsedEntry.Addr, + LocalAddr: frequentlyUsedEntry.LocalAddr, + LinkAddr: frequentlyUsedEntry.LinkAddr, + State: Reachable, + }, + } + + for i := store.size() - neighborCacheSize + 1; i < store.size(); i++ { + entry, ok := store.entry(i) + if !ok { + t.Fatalf("store.entry(%d) not found", i) + } + wantEntry := NeighborEntry{ + Addr: entry.Addr, + LocalAddr: entry.LocalAddr, + LinkAddr: entry.LinkAddr, + State: Reachable, + } + wantUnsortedEntries = append(wantUnsortedEntries, wantEntry) + } + + if diff := cmp.Diff(neigh.entries(), wantUnsortedEntries, entryDiffOptsWithSort()...); diff != "" { + t.Errorf("neighbor entries mismatch (-got, +want):\n%s", diff) + } + + // No more events should have been dispatched. + nudDisp.mu.Lock() + defer nudDisp.mu.Unlock() + if diff := cmp.Diff(nudDisp.events, []testEntryEventInfo(nil)); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } +} + +func TestNeighborCacheConcurrent(t *testing.T) { + const concurrentProcesses = 16 + + config := DefaultNUDConfigurations() + + nudDisp := testNUDDispatcher{} + clock := newFakeClock() + neigh := newTestNeighborCache(&nudDisp, config, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: typicalLatency, + } + + storeEntries := store.entries() + for _, entry := range storeEntries { + var wg sync.WaitGroup + for r := 0; r < concurrentProcesses; r++ { + wg.Add(1) + go func(entry NeighborEntry) { + defer wg.Done() + e, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != nil && err != tcpip.ErrWouldBlock { + t.Errorf("got neigh.entry(%s, %s, _, nil) = (%+v, _, %s), want (_, _, nil) or (_, _, %s)", entry.Addr, entry.LocalAddr, e, err, tcpip.ErrWouldBlock) + } + }(entry) + } + + // Wait for all gorountines to send a request + wg.Wait() + + // Process all the requests for a single entry concurrently + clock.advance(typicalLatency) + } + + // All goroutines add in the same order and add more values than can fit in + // the cache. Our eviction strategy requires that the last entries are + // present, up to the size of the neighbor cache, and the rest are missing. + // The order of entries reported by entries() is undeterministic, so entries + // have to be sorted before comparison. + var wantUnsortedEntries []NeighborEntry + for i := store.size() - neighborCacheSize; i < store.size(); i++ { + entry, ok := store.entry(i) + if !ok { + t.Errorf("store.entry(%d) not found", i) + } + wantEntry := NeighborEntry{ + Addr: entry.Addr, + LocalAddr: entry.LocalAddr, + LinkAddr: entry.LinkAddr, + State: Reachable, + } + wantUnsortedEntries = append(wantUnsortedEntries, wantEntry) + } + + if diff := cmp.Diff(neigh.entries(), wantUnsortedEntries, entryDiffOptsWithSort()...); diff != "" { + t.Errorf("neighbor entries mismatch (-got, +want):\n%s", diff) + } +} + +func TestNeighborCacheReplace(t *testing.T) { + config := DefaultNUDConfigurations() + + nudDisp := testNUDDispatcher{} + clock := newFakeClock() + neigh := newTestNeighborCache(&nudDisp, config, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: typicalLatency, + } + + // Add an entry + entry, ok := store.entry(0) + if !ok { + t.Fatalf("store.entry(0) not found") + } + _, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != tcpip.ErrWouldBlock { + t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + clock.advance(typicalLatency) + select { + case <-doneCh: + default: + t.Fatalf("expected notification from done channel returned by neigh.entry(%s, %s, _, nil)", entry.Addr, entry.LocalAddr) + } + + // Verify the entry exists + e, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != nil { + t.Errorf("unexpected error from neigh.entry(%s, %s, _, nil): %s", entry.Addr, entry.LocalAddr, err) + } + if doneCh != nil { + t.Errorf("unexpected done channel from neigh.entry(%s, %s, _, nil): %v", entry.Addr, entry.LocalAddr, doneCh) + } + if t.Failed() { + t.FailNow() + } + want := NeighborEntry{ + Addr: entry.Addr, + LocalAddr: entry.LocalAddr, + LinkAddr: entry.LinkAddr, + State: Reachable, + } + if diff := cmp.Diff(e, want, entryDiffOpts()...); diff != "" { + t.Errorf("neigh.entry(%s, %s, _, nil) mismatch (-got, +want):\n%s", entry.Addr, entry.LinkAddr, diff) + } + + // Notify of a link address change + var updatedLinkAddr tcpip.LinkAddress + { + entry, ok := store.entry(1) + if !ok { + t.Fatalf("store.entry(1) not found") + } + updatedLinkAddr = entry.LinkAddr + } + store.set(0, updatedLinkAddr) + neigh.HandleConfirmation(entry.Addr, updatedLinkAddr, ReachabilityConfirmationFlags{ + Solicited: false, + Override: true, + IsRouter: false, + }) + + // Requesting the entry again should start address resolution + { + _, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != tcpip.ErrWouldBlock { + t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + clock.advance(config.DelayFirstProbeTime + typicalLatency) + select { + case <-doneCh: + default: + t.Fatalf("expected notification from done channel returned by neigh.entry(%s, %s, _, nil)", entry.Addr, entry.LocalAddr) + } + } + + // Verify the entry's new link address + { + e, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + clock.advance(typicalLatency) + if err != nil { + t.Errorf("unexpected error from neigh.entry(%s, %s, _, nil): %s", entry.Addr, entry.LocalAddr, err) + } + want = NeighborEntry{ + Addr: entry.Addr, + LocalAddr: entry.LocalAddr, + LinkAddr: updatedLinkAddr, + State: Reachable, + } + if diff := cmp.Diff(e, want, entryDiffOpts()...); diff != "" { + t.Errorf("neigh.entry(%s, %s, _, nil) mismatch (-got, +want):\n%s", entry.Addr, entry.LocalAddr, diff) + } + } +} + +func TestNeighborCacheResolutionFailed(t *testing.T) { + config := DefaultNUDConfigurations() + + nudDisp := testNUDDispatcher{} + clock := newFakeClock() + neigh := newTestNeighborCache(&nudDisp, config, clock) + store := newTestEntryStore() + + var requestCount uint32 + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: typicalLatency, + onLinkAddressRequest: func() { + atomic.AddUint32(&requestCount, 1) + }, + } + + // First, sanity check that resolution is working + entry, ok := store.entry(0) + if !ok { + t.Fatalf("store.entry(0) not found") + } + if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != tcpip.ErrWouldBlock { + t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + clock.advance(typicalLatency) + got, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != nil { + t.Fatalf("unexpected error from neigh.entry(%s, %s, _, nil): %s", entry.Addr, entry.LocalAddr, err) + } + want := NeighborEntry{ + Addr: entry.Addr, + LocalAddr: entry.LocalAddr, + LinkAddr: entry.LinkAddr, + State: Reachable, + } + if diff := cmp.Diff(got, want, entryDiffOpts()...); diff != "" { + t.Errorf("neigh.entry(%s, %s, _, nil) mismatch (-got, +want):\n%s", entry.Addr, entry.LocalAddr, diff) + } + + // Verify that address resolution for an unknown address returns ErrNoLinkAddress + before := atomic.LoadUint32(&requestCount) + + entry.Addr += "2" + if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != tcpip.ErrWouldBlock { + t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + waitFor := config.DelayFirstProbeTime + typicalLatency*time.Duration(config.MaxMulticastProbes) + clock.advance(waitFor) + if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != tcpip.ErrNoLinkAddress { + t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrNoLinkAddress) + } + + maxAttempts := neigh.config().MaxUnicastProbes + if got, want := atomic.LoadUint32(&requestCount)-before, maxAttempts; got != want { + t.Errorf("got link address request count = %d, want = %d", got, want) + } +} + +// TestNeighborCacheResolutionTimeout simulates sending MaxMulticastProbes +// probes and not retrieving a confirmation before the duration defined by +// MaxMulticastProbes * RetransmitTimer. +func TestNeighborCacheResolutionTimeout(t *testing.T) { + config := DefaultNUDConfigurations() + config.RetransmitTimer = time.Millisecond // small enough to cause timeout + + clock := newFakeClock() + neigh := newTestNeighborCache(nil, config, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: time.Minute, // large enough to cause timeout + } + + entry, ok := store.entry(0) + if !ok { + t.Fatalf("store.entry(0) not found") + } + if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != tcpip.ErrWouldBlock { + t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + waitFor := config.RetransmitTimer * time.Duration(config.MaxMulticastProbes) + clock.advance(waitFor) + if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != tcpip.ErrNoLinkAddress { + t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrNoLinkAddress) + } +} + +// TestNeighborCacheStaticResolution checks that static link addresses are +// resolved immediately and don't send resolution requests. +func TestNeighborCacheStaticResolution(t *testing.T) { + config := DefaultNUDConfigurations() + clock := newFakeClock() + neigh := newTestNeighborCache(nil, config, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: typicalLatency, + } + + got, _, err := neigh.entry(testEntryBroadcastAddr, testEntryLocalAddr, linkRes, nil) + if err != nil { + t.Fatalf("unexpected error from neigh.entry(%s, %s, _, nil): %s", testEntryBroadcastAddr, testEntryLocalAddr, err) + } + want := NeighborEntry{ + Addr: testEntryBroadcastAddr, + LocalAddr: testEntryLocalAddr, + LinkAddr: testEntryBroadcastLinkAddr, + State: Static, + } + if diff := cmp.Diff(got, want, entryDiffOpts()...); diff != "" { + t.Errorf("neigh.entry(%s, %s, _, nil) mismatch (-got, +want):\n%s", testEntryBroadcastAddr, testEntryLocalAddr, diff) + } +} + +func BenchmarkCacheClear(b *testing.B) { + b.StopTimer() + config := DefaultNUDConfigurations() + clock := &tcpip.StdClock{} + neigh := newTestNeighborCache(nil, config, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: 0, + } + + // Clear for every possible size of the cache + for cacheSize := 0; cacheSize < neighborCacheSize; cacheSize++ { + // Fill the neighbor cache to capacity. + for i := 0; i < cacheSize; i++ { + entry, ok := store.entry(i) + if !ok { + b.Fatalf("store.entry(%d) not found", i) + } + _, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != tcpip.ErrWouldBlock { + b.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + if doneCh != nil { + <-doneCh + } + } + + b.StartTimer() + neigh.clear() + b.StopTimer() + } +} diff --git a/pkg/tcpip/stack/neighbor_entry.go b/pkg/tcpip/stack/neighbor_entry.go new file mode 100644 index 000000000..0068cacb8 --- /dev/null +++ b/pkg/tcpip/stack/neighbor_entry.go @@ -0,0 +1,482 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stack + +import ( + "fmt" + "sync" + "time" + + "gvisor.dev/gvisor/pkg/sleep" + "gvisor.dev/gvisor/pkg/tcpip" +) + +// NeighborEntry describes a neighboring device in the local network. +type NeighborEntry struct { + Addr tcpip.Address + LocalAddr tcpip.Address + LinkAddr tcpip.LinkAddress + State NeighborState + UpdatedAt time.Time +} + +// NeighborState defines the state of a NeighborEntry within the Neighbor +// Unreachability Detection state machine, as per RFC 4861 section 7.3.2. +type NeighborState uint8 + +const ( + // Unknown means reachability has not been verified yet. This is the initial + // state of entries that have been created automatically by the Neighbor + // Unreachability Detection state machine. + Unknown NeighborState = iota + // Incomplete means that there is an outstanding request to resolve the + // address. + Incomplete + // Reachable means the path to the neighbor is functioning properly for both + // receive and transmit paths. + Reachable + // Stale means reachability to the neighbor is unknown, but packets are still + // able to be transmitted to the possibly stale link address. + Stale + // Delay means reachability to the neighbor is unknown and pending + // confirmation from an upper-level protocol like TCP, but packets are still + // able to be transmitted to the possibly stale link address. + Delay + // Probe means a reachability confirmation is actively being sought by + // periodically retransmitting reachability probes until a reachability + // confirmation is received, or until the max amount of probes has been sent. + Probe + // Static describes entries that have been explicitly added by the user. They + // do not expire and are not deleted until explicitly removed. + Static + // Failed means traffic should not be sent to this neighbor since attempts of + // reachability have returned inconclusive. + Failed +) + +// neighborEntry implements a neighbor entry's individual node behavior, as per +// RFC 4861 section 7.3.3. Neighbor Unreachability Detection operates in +// parallel with the sending of packets to a neighbor, necessitating the +// entry's lock to be acquired for all operations. +type neighborEntry struct { + neighborEntryEntry + + nic *NIC + protocol tcpip.NetworkProtocolNumber + + // linkRes provides the functionality to send reachability probes, used in + // Neighbor Unreachability Detection. + linkRes LinkAddressResolver + + // nudState points to the Neighbor Unreachability Detection configuration. + nudState *NUDState + + // mu protects the fields below. + mu sync.RWMutex + + neigh NeighborEntry + + // wakers is a set of waiters for address resolution result. Anytime state + // transitions out of incomplete these waiters are notified. It is nil iff + // address resolution is ongoing and no clients are waiting for the result. + wakers map[*sleep.Waker]struct{} + + // done is used to allow callers to wait on address resolution. It is nil + // iff nudState is not Reachable and address resolution is not yet in + // progress. + done chan struct{} + + isRouter bool + job *tcpip.Job +} + +// newNeighborEntry creates a neighbor cache entry starting at the default +// state, Unknown. Transition out of Unknown by calling either +// `handlePacketQueuedLocked` or `handleProbeLocked` on the newly created +// neighborEntry. +func newNeighborEntry(nic *NIC, remoteAddr tcpip.Address, localAddr tcpip.Address, nudState *NUDState, linkRes LinkAddressResolver) *neighborEntry { + return &neighborEntry{ + nic: nic, + linkRes: linkRes, + nudState: nudState, + neigh: NeighborEntry{ + Addr: remoteAddr, + LocalAddr: localAddr, + State: Unknown, + }, + } +} + +// newStaticNeighborEntry creates a neighbor cache entry starting at the Static +// state. The entry can only transition out of Static by directly calling +// `setStateLocked`. +func newStaticNeighborEntry(nic *NIC, addr tcpip.Address, linkAddr tcpip.LinkAddress, state *NUDState) *neighborEntry { + if nic.stack.nudDisp != nil { + nic.stack.nudDisp.OnNeighborAdded(nic.id, addr, linkAddr, Static, time.Now()) + } + return &neighborEntry{ + nic: nic, + nudState: state, + neigh: NeighborEntry{ + Addr: addr, + LinkAddr: linkAddr, + State: Static, + UpdatedAt: time.Now(), + }, + } +} + +// addWaker adds w to the list of wakers waiting for address resolution. +// Assumes the entry has already been appropriately locked. +func (e *neighborEntry) addWakerLocked(w *sleep.Waker) { + if w == nil { + return + } + if e.wakers == nil { + e.wakers = make(map[*sleep.Waker]struct{}) + } + e.wakers[w] = struct{}{} +} + +// notifyWakersLocked notifies those waiting for address resolution, whether it +// succeeded or failed. Assumes the entry has already been appropriately locked. +func (e *neighborEntry) notifyWakersLocked() { + for w := range e.wakers { + w.Assert() + } + e.wakers = nil + if ch := e.done; ch != nil { + close(ch) + e.done = nil + } +} + +// dispatchAddEventLocked signals to stack's NUD Dispatcher that the entry has +// been added. +func (e *neighborEntry) dispatchAddEventLocked(nextState NeighborState) { + if nudDisp := e.nic.stack.nudDisp; nudDisp != nil { + nudDisp.OnNeighborAdded(e.nic.id, e.neigh.Addr, e.neigh.LinkAddr, nextState, time.Now()) + } +} + +// dispatchChangeEventLocked signals to stack's NUD Dispatcher that the entry +// has changed state or link-layer address. +func (e *neighborEntry) dispatchChangeEventLocked(nextState NeighborState) { + if nudDisp := e.nic.stack.nudDisp; nudDisp != nil { + nudDisp.OnNeighborChanged(e.nic.id, e.neigh.Addr, e.neigh.LinkAddr, nextState, time.Now()) + } +} + +// dispatchRemoveEventLocked signals to stack's NUD Dispatcher that the entry +// has been removed. +func (e *neighborEntry) dispatchRemoveEventLocked() { + if nudDisp := e.nic.stack.nudDisp; nudDisp != nil { + nudDisp.OnNeighborRemoved(e.nic.id, e.neigh.Addr, e.neigh.LinkAddr, e.neigh.State, time.Now()) + } +} + +// setStateLocked transitions the entry to the specified state immediately. +// +// Follows the logic defined in RFC 4861 section 7.3.3. +// +// e.mu MUST be locked. +func (e *neighborEntry) setStateLocked(next NeighborState) { + // Cancel the previously scheduled action, if there is one. Entries in + // Unknown, Stale, or Static state do not have scheduled actions. + if timer := e.job; timer != nil { + timer.Cancel() + } + + prev := e.neigh.State + e.neigh.State = next + e.neigh.UpdatedAt = time.Now() + config := e.nudState.Config() + + switch next { + case Incomplete: + var retryCounter uint32 + var sendMulticastProbe func() + + sendMulticastProbe = func() { + if retryCounter == config.MaxMulticastProbes { + // "If no Neighbor Advertisement is received after + // MAX_MULTICAST_SOLICIT solicitations, address resolution has failed. + // The sender MUST return ICMP destination unreachable indications with + // code 3 (Address Unreachable) for each packet queued awaiting address + // resolution." - RFC 4861 section 7.2.2 + // + // There is no need to send an ICMP destination unreachable indication + // since the failure to resolve the address is expected to only occur + // on this node. Thus, redirecting traffic is currently not supported. + // + // "If the error occurs on a node other than the node originating the + // packet, an ICMP error message is generated. If the error occurs on + // the originating node, an implementation is not required to actually + // create and send an ICMP error packet to the source, as long as the + // upper-layer sender is notified through an appropriate mechanism + // (e.g. return value from a procedure call). Note, however, that an + // implementation may find it convenient in some cases to return errors + // to the sender by taking the offending packet, generating an ICMP + // error message, and then delivering it (locally) through the generic + // error-handling routines.' - RFC 4861 section 2.1 + e.dispatchRemoveEventLocked() + e.setStateLocked(Failed) + return + } + + if err := e.linkRes.LinkAddressRequest(e.neigh.Addr, e.neigh.LocalAddr, "", e.nic.linkEP); err != nil { + // There is no need to log the error here; the NUD implementation may + // assume a working link. A valid link should be the responsibility of + // the NIC/stack.LinkEndpoint. + e.dispatchRemoveEventLocked() + e.setStateLocked(Failed) + return + } + + retryCounter++ + e.job = e.nic.stack.newJob(&e.mu, sendMulticastProbe) + e.job.Schedule(config.RetransmitTimer) + } + + sendMulticastProbe() + + case Reachable: + e.job = e.nic.stack.newJob(&e.mu, func() { + e.dispatchChangeEventLocked(Stale) + e.setStateLocked(Stale) + }) + e.job.Schedule(e.nudState.ReachableTime()) + + case Delay: + e.job = e.nic.stack.newJob(&e.mu, func() { + e.dispatchChangeEventLocked(Probe) + e.setStateLocked(Probe) + }) + e.job.Schedule(config.DelayFirstProbeTime) + + case Probe: + var retryCounter uint32 + var sendUnicastProbe func() + + sendUnicastProbe = func() { + if retryCounter == config.MaxUnicastProbes { + e.dispatchRemoveEventLocked() + e.setStateLocked(Failed) + return + } + + if err := e.linkRes.LinkAddressRequest(e.neigh.Addr, e.neigh.LocalAddr, e.neigh.LinkAddr, e.nic.linkEP); err != nil { + e.dispatchRemoveEventLocked() + e.setStateLocked(Failed) + return + } + + retryCounter++ + if retryCounter == config.MaxUnicastProbes { + e.dispatchRemoveEventLocked() + e.setStateLocked(Failed) + return + } + + e.job = e.nic.stack.newJob(&e.mu, sendUnicastProbe) + e.job.Schedule(config.RetransmitTimer) + } + + sendUnicastProbe() + + case Failed: + e.notifyWakersLocked() + e.job = e.nic.stack.newJob(&e.mu, func() { + e.nic.neigh.removeEntryLocked(e) + }) + e.job.Schedule(config.UnreachableTime) + + case Unknown, Stale, Static: + // Do nothing + + default: + panic(fmt.Sprintf("Invalid state transition from %q to %q", prev, next)) + } +} + +// handlePacketQueuedLocked advances the state machine according to a packet +// being queued for outgoing transmission. +// +// Follows the logic defined in RFC 4861 section 7.3.3. +func (e *neighborEntry) handlePacketQueuedLocked() { + switch e.neigh.State { + case Unknown: + e.dispatchAddEventLocked(Incomplete) + e.setStateLocked(Incomplete) + + case Stale: + e.dispatchChangeEventLocked(Delay) + e.setStateLocked(Delay) + + case Incomplete, Reachable, Delay, Probe, Static, Failed: + // Do nothing + + default: + panic(fmt.Sprintf("Invalid cache entry state: %s", e.neigh.State)) + } +} + +// handleProbeLocked processes an incoming neighbor probe (e.g. ARP request or +// Neighbor Solicitation for ARP or NDP, respectively). +// +// Follows the logic defined in RFC 4861 section 7.2.3. +func (e *neighborEntry) handleProbeLocked(remoteLinkAddr tcpip.LinkAddress) { + // Probes MUST be silently discarded if the target address is tentative, does + // not exist, or not bound to the NIC as per RFC 4861 section 7.2.3. These + // checks MUST be done by the NetworkEndpoint. + + switch e.neigh.State { + case Unknown, Incomplete, Failed: + e.neigh.LinkAddr = remoteLinkAddr + e.dispatchAddEventLocked(Stale) + e.setStateLocked(Stale) + e.notifyWakersLocked() + + case Reachable, Delay, Probe: + if e.neigh.LinkAddr != remoteLinkAddr { + e.neigh.LinkAddr = remoteLinkAddr + e.dispatchChangeEventLocked(Stale) + e.setStateLocked(Stale) + } + + case Stale: + if e.neigh.LinkAddr != remoteLinkAddr { + e.neigh.LinkAddr = remoteLinkAddr + e.dispatchChangeEventLocked(Stale) + } + + case Static: + // Do nothing + + default: + panic(fmt.Sprintf("Invalid cache entry state: %s", e.neigh.State)) + } +} + +// handleConfirmationLocked processes an incoming neighbor confirmation +// (e.g. ARP reply or Neighbor Advertisement for ARP or NDP, respectively). +// +// Follows the state machine defined by RFC 4861 section 7.2.5. +// +// TODO(gvisor.dev/issue/2277): To protect against ARP poisoning and other +// attacks against NDP functions, Secure Neighbor Discovery (SEND) Protocol +// should be deployed where preventing access to the broadcast segment might +// not be possible. SEND uses RSA key pairs to produce Cryptographically +// Generated Addresses (CGA), as defined in RFC 3972. This ensures that the +// claimed source of an NDP message is the owner of the claimed address. +func (e *neighborEntry) handleConfirmationLocked(linkAddr tcpip.LinkAddress, flags ReachabilityConfirmationFlags) { + switch e.neigh.State { + case Incomplete: + if len(linkAddr) == 0 { + // "If the link layer has addresses and no Target Link-Layer Address + // option is included, the receiving node SHOULD silently discard the + // received advertisement." - RFC 4861 section 7.2.5 + break + } + + e.neigh.LinkAddr = linkAddr + if flags.Solicited { + e.dispatchChangeEventLocked(Reachable) + e.setStateLocked(Reachable) + } else { + e.dispatchChangeEventLocked(Stale) + e.setStateLocked(Stale) + } + e.isRouter = flags.IsRouter + e.notifyWakersLocked() + + // "Note that the Override flag is ignored if the entry is in the + // INCOMPLETE state." - RFC 4861 section 7.2.5 + + case Reachable, Stale, Delay, Probe: + sameLinkAddr := e.neigh.LinkAddr == linkAddr + + if !sameLinkAddr { + if !flags.Override { + if e.neigh.State == Reachable { + e.dispatchChangeEventLocked(Stale) + e.setStateLocked(Stale) + } + break + } + + e.neigh.LinkAddr = linkAddr + + if !flags.Solicited { + if e.neigh.State != Stale { + e.dispatchChangeEventLocked(Stale) + e.setStateLocked(Stale) + } else { + // Notify the LinkAddr change, even though NUD state hasn't changed. + e.dispatchChangeEventLocked(e.neigh.State) + } + break + } + } + + if flags.Solicited && (flags.Override || sameLinkAddr) { + if e.neigh.State != Reachable { + e.dispatchChangeEventLocked(Reachable) + } + // Set state to Reachable again to refresh timers. + e.setStateLocked(Reachable) + e.notifyWakersLocked() + } + + if e.isRouter && !flags.IsRouter { + // "In those cases where the IsRouter flag changes from TRUE to FALSE as + // a result of this update, the node MUST remove that router from the + // Default Router List and update the Destination Cache entries for all + // destinations using that neighbor as a router as specified in Section + // 7.3.3. This is needed to detect when a node that is used as a router + // stops forwarding packets due to being configured as a host." + // - RFC 4861 section 7.2.5 + e.nic.mu.Lock() + e.nic.mu.ndp.invalidateDefaultRouter(e.neigh.Addr) + e.nic.mu.Unlock() + } + e.isRouter = flags.IsRouter + + case Unknown, Failed, Static: + // Do nothing + + default: + panic(fmt.Sprintf("Invalid cache entry state: %s", e.neigh.State)) + } +} + +// handleUpperLevelConfirmationLocked processes an incoming upper-level protocol +// (e.g. TCP acknowledgements) reachability confirmation. +func (e *neighborEntry) handleUpperLevelConfirmationLocked() { + switch e.neigh.State { + case Reachable, Stale, Delay, Probe: + if e.neigh.State != Reachable { + e.dispatchChangeEventLocked(Reachable) + // Set state to Reachable again to refresh timers. + } + e.setStateLocked(Reachable) + + case Unknown, Incomplete, Failed, Static: + // Do nothing + + default: + panic(fmt.Sprintf("Invalid cache entry state: %s", e.neigh.State)) + } +} diff --git a/pkg/tcpip/stack/neighbor_entry_test.go b/pkg/tcpip/stack/neighbor_entry_test.go new file mode 100644 index 000000000..b769fb2fa --- /dev/null +++ b/pkg/tcpip/stack/neighbor_entry_test.go @@ -0,0 +1,2870 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stack + +import ( + "fmt" + "math" + "math/rand" + "strings" + "sync" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "gvisor.dev/gvisor/pkg/sleep" + "gvisor.dev/gvisor/pkg/tcpip" +) + +const ( + entryTestNetNumber tcpip.NetworkProtocolNumber = math.MaxUint32 + + entryTestNICID tcpip.NICID = 1 + entryTestAddr1 = tcpip.Address("\x00\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01") + entryTestAddr2 = tcpip.Address("\x00\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02") + + entryTestLinkAddr1 = tcpip.LinkAddress("\x0a\x00\x00\x00\x00\x01") + entryTestLinkAddr2 = tcpip.LinkAddress("\x0a\x00\x00\x00\x00\x02") + + // entryTestNetDefaultMTU is the MTU, in bytes, used throughout the tests, + // except where another value is explicitly used. It is chosen to match the + // MTU of loopback interfaces on Linux systems. + entryTestNetDefaultMTU = 65536 +) + +// eventDiffOpts are the options passed to cmp.Diff to compare entry events. +// The UpdatedAt field is ignored due to a lack of a deterministic method to +// predict the time that an event will be dispatched. +func eventDiffOpts() []cmp.Option { + return []cmp.Option{ + cmpopts.IgnoreFields(testEntryEventInfo{}, "UpdatedAt"), + } +} + +// eventDiffOptsWithSort is like eventDiffOpts but also includes an option to +// sort slices of events for cases where ordering must be ignored. +func eventDiffOptsWithSort() []cmp.Option { + return []cmp.Option{ + cmpopts.IgnoreFields(testEntryEventInfo{}, "UpdatedAt"), + cmpopts.SortSlices(func(a, b testEntryEventInfo) bool { + return strings.Compare(string(a.Addr), string(b.Addr)) < 0 + }), + } +} + +// The following unit tests exercise every state transition and verify its +// behavior with RFC 4681. +// +// | From | To | Cause | Action | Event | +// | ========== | ========== | ========================================== | =============== | ======= | +// | Unknown | Unknown | Confirmation w/ unknown address | | Added | +// | Unknown | Incomplete | Packet queued to unknown address | Send probe | Added | +// | Unknown | Stale | Probe w/ unknown address | | Added | +// | Incomplete | Incomplete | Retransmit timer expired | Send probe | Changed | +// | Incomplete | Reachable | Solicited confirmation | Notify wakers | Changed | +// | Incomplete | Stale | Unsolicited confirmation | Notify wakers | Changed | +// | Incomplete | Failed | Max probes sent without reply | Notify wakers | Removed | +// | Reachable | Reachable | Confirmation w/ different isRouter flag | Update IsRouter | | +// | Reachable | Stale | Reachable timer expired | | Changed | +// | Reachable | Stale | Probe or confirmation w/ different address | | Changed | +// | Stale | Reachable | Solicited override confirmation | Update LinkAddr | Changed | +// | Stale | Stale | Override confirmation | Update LinkAddr | Changed | +// | Stale | Stale | Probe w/ different address | Update LinkAddr | Changed | +// | Stale | Delay | Packet sent | | Changed | +// | Delay | Reachable | Upper-layer confirmation | | Changed | +// | Delay | Reachable | Solicited override confirmation | Update LinkAddr | Changed | +// | Delay | Stale | Probe or confirmation w/ different address | | Changed | +// | Delay | Probe | Delay timer expired | Send probe | Changed | +// | Probe | Reachable | Solicited override confirmation | Update LinkAddr | Changed | +// | Probe | Reachable | Solicited confirmation w/ same address | Notify wakers | Changed | +// | Probe | Stale | Probe or confirmation w/ different address | | Changed | +// | Probe | Probe | Retransmit timer expired | Send probe | Changed | +// | Probe | Failed | Max probes sent without reply | Notify wakers | Removed | +// | Failed | | Unreachability timer expired | Delete entry | | + +type testEntryEventType uint8 + +const ( + entryTestAdded testEntryEventType = iota + entryTestChanged + entryTestRemoved +) + +func (t testEntryEventType) String() string { + switch t { + case entryTestAdded: + return "add" + case entryTestChanged: + return "change" + case entryTestRemoved: + return "remove" + default: + return fmt.Sprintf("unknown (%d)", t) + } +} + +// Fields are exported for use with cmp.Diff. +type testEntryEventInfo struct { + EventType testEntryEventType + NICID tcpip.NICID + Addr tcpip.Address + LinkAddr tcpip.LinkAddress + State NeighborState + UpdatedAt time.Time +} + +func (e testEntryEventInfo) String() string { + return fmt.Sprintf("%s event for NIC #%d, addr=%q, linkAddr=%q, state=%q", e.EventType, e.NICID, e.Addr, e.LinkAddr, e.State) +} + +// testNUDDispatcher implements NUDDispatcher to validate the dispatching of +// events upon certain NUD state machine events. +type testNUDDispatcher struct { + mu sync.Mutex + events []testEntryEventInfo +} + +var _ NUDDispatcher = (*testNUDDispatcher)(nil) + +func (d *testNUDDispatcher) queueEvent(e testEntryEventInfo) { + d.mu.Lock() + defer d.mu.Unlock() + d.events = append(d.events, e) +} + +func (d *testNUDDispatcher) OnNeighborAdded(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time) { + d.queueEvent(testEntryEventInfo{ + EventType: entryTestAdded, + NICID: nicID, + Addr: addr, + LinkAddr: linkAddr, + State: state, + UpdatedAt: updatedAt, + }) +} + +func (d *testNUDDispatcher) OnNeighborChanged(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time) { + d.queueEvent(testEntryEventInfo{ + EventType: entryTestChanged, + NICID: nicID, + Addr: addr, + LinkAddr: linkAddr, + State: state, + UpdatedAt: updatedAt, + }) +} + +func (d *testNUDDispatcher) OnNeighborRemoved(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time) { + d.queueEvent(testEntryEventInfo{ + EventType: entryTestRemoved, + NICID: nicID, + Addr: addr, + LinkAddr: linkAddr, + State: state, + UpdatedAt: updatedAt, + }) +} + +type entryTestLinkResolver struct { + mu sync.Mutex + probes []entryTestProbeInfo +} + +var _ LinkAddressResolver = (*entryTestLinkResolver)(nil) + +type entryTestProbeInfo struct { + RemoteAddress tcpip.Address + RemoteLinkAddress tcpip.LinkAddress + LocalAddress tcpip.Address +} + +func (p entryTestProbeInfo) String() string { + return fmt.Sprintf("probe with RemoteAddress=%q, RemoteLinkAddress=%q, LocalAddress=%q", p.RemoteAddress, p.RemoteLinkAddress, p.LocalAddress) +} + +// LinkAddressRequest sends a request for the LinkAddress of addr. Broadcasts +// to the local network if linkAddr is the zero value. +func (r *entryTestLinkResolver) LinkAddressRequest(addr, localAddr tcpip.Address, linkAddr tcpip.LinkAddress, linkEP LinkEndpoint) *tcpip.Error { + p := entryTestProbeInfo{ + RemoteAddress: addr, + RemoteLinkAddress: linkAddr, + LocalAddress: localAddr, + } + r.mu.Lock() + defer r.mu.Unlock() + r.probes = append(r.probes, p) + return nil +} + +// ResolveStaticAddress attempts to resolve address without sending requests. +// It either resolves the name immediately or returns the empty LinkAddress. +func (r *entryTestLinkResolver) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) { + return "", false +} + +// LinkAddressProtocol returns the network protocol of the addresses this +// resolver can resolve. +func (r *entryTestLinkResolver) LinkAddressProtocol() tcpip.NetworkProtocolNumber { + return entryTestNetNumber +} + +func entryTestSetup(c NUDConfigurations) (*neighborEntry, *testNUDDispatcher, *entryTestLinkResolver, *fakeClock) { + clock := newFakeClock() + disp := testNUDDispatcher{} + nic := NIC{ + id: entryTestNICID, + linkEP: nil, // entryTestLinkResolver doesn't use a LinkEndpoint + stack: &Stack{ + clock: clock, + nudDisp: &disp, + }, + } + + rng := rand.New(rand.NewSource(time.Now().UnixNano())) + nudState := NewNUDState(c, rng) + linkRes := entryTestLinkResolver{} + entry := newNeighborEntry(&nic, entryTestAddr1 /* remoteAddr */, entryTestAddr2 /* localAddr */, nudState, &linkRes) + + // Stub out ndpState to verify modification of default routers. + nic.mu.ndp = ndpState{ + nic: &nic, + defaultRouters: make(map[tcpip.Address]defaultRouterState), + } + + // Stub out the neighbor cache to verify deletion from the cache. + nic.neigh = &neighborCache{ + nic: &nic, + state: nudState, + cache: make(map[tcpip.Address]*neighborEntry, neighborCacheSize), + } + nic.neigh.cache[entryTestAddr1] = entry + + return entry, &disp, &linkRes, clock +} + +// TestEntryInitiallyUnknown verifies that the state of a newly created +// neighborEntry is Unknown. +func TestEntryInitiallyUnknown(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + if got, want := e.neigh.State, Unknown; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + clock.advance(c.RetransmitTimer) + + // No probes should have been sent. + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, []entryTestProbeInfo(nil)) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + // No events should have been dispatched. + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, []testEntryEventInfo(nil)); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryUnknownToUnknownWhenConfirmationWithUnknownAddress(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Unknown; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + clock.advance(time.Hour) + + // No probes should have been sent. + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, []entryTestProbeInfo(nil)) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + // No events should have been dispatched. + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, []testEntryEventInfo(nil)); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryUnknownToIncomplete(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Incomplete; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + } + { + nudDisp.mu.Lock() + diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...) + nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + } +} + +func TestEntryUnknownToStale(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handleProbeLocked(entryTestLinkAddr1) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + // No probes should have been sent. + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, []entryTestProbeInfo(nil)) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryIncompleteToIncompleteDoesNotChangeUpdatedAt(t *testing.T) { + c := DefaultNUDConfigurations() + c.MaxMulticastProbes = 3 + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Incomplete; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + updatedAt := e.neigh.UpdatedAt + e.mu.Unlock() + + clock.advance(c.RetransmitTimer) + + // UpdatedAt should remain the same during address resolution. + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.probes = nil + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + e.mu.Lock() + if got, want := e.neigh.UpdatedAt, updatedAt; got != want { + t.Errorf("got e.neigh.UpdatedAt = %q, want = %q", got, want) + } + e.mu.Unlock() + + clock.advance(c.RetransmitTimer) + + // UpdatedAt should change after failing address resolution. Timing out after + // sending the last probe transitions the entry to Failed. + { + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + } + + clock.advance(c.RetransmitTimer) + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestRemoved, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + e.mu.Lock() + if got, notWant := e.neigh.UpdatedAt, updatedAt; got == notWant { + t.Errorf("expected e.neigh.UpdatedAt to change, got = %q", got) + } + e.mu.Unlock() +} + +func TestEntryIncompleteToReachable(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Incomplete; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +// TestEntryAddsAndClearsWakers verifies that wakers are added when +// addWakerLocked is called and cleared when address resolution finishes. In +// this case, address resolution will finish when transitioning from Incomplete +// to Reachable. +func TestEntryAddsAndClearsWakers(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + w := sleep.Waker{} + s := sleep.Sleeper{} + s.AddWaker(&w, 123) + defer s.Done() + + e.mu.Lock() + e.handlePacketQueuedLocked() + if got := e.wakers; got != nil { + t.Errorf("got e.wakers = %v, want = nil", got) + } + e.addWakerLocked(&w) + if got, want := w.IsAsserted(), false; got != want { + t.Errorf("waker.IsAsserted() = %t, want = %t", got, want) + } + if e.wakers == nil { + t.Error("expected e.wakers to be non-nil") + } + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + if e.wakers != nil { + t.Errorf("got e.wakers = %v, want = nil", e.wakers) + } + if got, want := w.IsAsserted(), true; got != want { + t.Errorf("waker.IsAsserted() = %t, want = %t", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryIncompleteToReachableWithRouterFlag(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Incomplete; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: true, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + if got, want := e.isRouter, true; got != want { + t.Errorf("got e.isRouter = %t, want = %t", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + if diff := cmp.Diff(linkRes.probes, wantProbes); diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + linkRes.mu.Unlock() + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryIncompleteToStale(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Incomplete; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryIncompleteToFailed(t *testing.T) { + c := DefaultNUDConfigurations() + c.MaxMulticastProbes = 3 + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Incomplete; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + waitFor := c.RetransmitTimer * time.Duration(c.MaxMulticastProbes) + clock.advance(waitFor) + + wantProbes := []entryTestProbeInfo{ + // The Incomplete-to-Incomplete state transition is tested here by + // verifying that 3 reachability probes were sent. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestRemoved, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + e.mu.Lock() + if got, want := e.neigh.State, Failed; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() +} + +type testLocker struct{} + +var _ sync.Locker = (*testLocker)(nil) + +func (*testLocker) Lock() {} +func (*testLocker) Unlock() {} + +func TestEntryStaysReachableWhenConfirmationWithRouterFlag(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: true, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + if got, want := e.isRouter, true; got != want { + t.Errorf("got e.isRouter = %t, want = %t", got, want) + } + e.nic.mu.ndp.defaultRouters[entryTestAddr1] = defaultRouterState{ + invalidationJob: e.nic.stack.newJob(&testLocker{}, func() {}), + } + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + if got, want := e.isRouter, false; got != want { + t.Errorf("got e.isRouter = %t, want = %t", got, want) + } + if _, ok := e.nic.mu.ndp.defaultRouters[entryTestAddr1]; ok { + t.Errorf("unexpected defaultRouter for %s", entryTestAddr1) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + e.mu.Lock() + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() +} + +func TestEntryStaysReachableWhenProbeWithSameAddress(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleProbeLocked(entryTestLinkAddr1) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + if got, want := e.neigh.LinkAddr, entryTestLinkAddr1; got != want { + t.Errorf("got e.neigh.LinkAddr = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryReachableToStaleWhenTimeout(t *testing.T) { + c := DefaultNUDConfigurations() + // Eliminate random factors from ReachableTime computation so the transition + // from Stale to Reachable will only take BaseReachableTime duration. + c.MinRandomFactor = 1 + c.MaxRandomFactor = 1 + + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + clock.advance(c.BaseReachableTime) + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + e.mu.Lock() + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() +} + +func TestEntryReachableToStaleWhenProbeWithDifferentAddress(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleProbeLocked(entryTestLinkAddr2) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + e.mu.Lock() + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() +} + +func TestEntryReachableToStaleWhenConfirmationWithDifferentAddress(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr2, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + e.mu.Lock() + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() +} + +func TestEntryReachableToStaleWhenConfirmationWithDifferentAddressAndOverride(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr2, ReachabilityConfirmationFlags{ + Solicited: false, + Override: true, + IsRouter: false, + }) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + e.mu.Lock() + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() +} + +func TestEntryStaysStaleWhenProbeWithSameAddress(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleProbeLocked(entryTestLinkAddr1) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + if got, want := e.neigh.LinkAddr, entryTestLinkAddr1; got != want { + t.Errorf("got e.neigh.LinkAddr = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryStaleToReachableWhenSolicitedOverrideConfirmation(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr2, ReachabilityConfirmationFlags{ + Solicited: true, + Override: true, + IsRouter: false, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + if got, want := e.neigh.LinkAddr, entryTestLinkAddr2; got != want { + t.Errorf("got e.neigh.LinkAddr = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Reachable, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryStaleToStaleWhenOverrideConfirmation(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr2, ReachabilityConfirmationFlags{ + Solicited: false, + Override: true, + IsRouter: false, + }) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + if got, want := e.neigh.LinkAddr, entryTestLinkAddr2; got != want { + t.Errorf("got e.neigh.LinkAddr = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryStaleToStaleWhenProbeUpdateAddress(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleProbeLocked(entryTestLinkAddr2) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + if got, want := e.neigh.LinkAddr, entryTestLinkAddr2; got != want { + t.Errorf("got e.neigh.LinkAddr = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryStaleToDelay(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Delay; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryDelayToReachableWhenUpperLevelConfirmation(t *testing.T) { + c := DefaultNUDConfigurations() + // Eliminate random factors from ReachableTime computation so the transition + // from Stale to Reachable will only take BaseReachableTime duration. + c.MinRandomFactor = 1 + c.MaxRandomFactor = 1 + + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Delay; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleUpperLevelConfirmationLocked() + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + clock.advance(c.BaseReachableTime) + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryDelayToReachableWhenSolicitedOverrideConfirmation(t *testing.T) { + c := DefaultNUDConfigurations() + c.MaxMulticastProbes = 1 + // Eliminate random factors from ReachableTime computation so the transition + // from Stale to Reachable will only take BaseReachableTime duration. + c.MinRandomFactor = 1 + c.MaxRandomFactor = 1 + + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Delay; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr2, ReachabilityConfirmationFlags{ + Solicited: true, + Override: true, + IsRouter: false, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + if got, want := e.neigh.LinkAddr, entryTestLinkAddr2; got != want { + t.Errorf("got e.neigh.LinkAddr = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + clock.advance(c.BaseReachableTime) + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Reachable, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryStaysDelayWhenOverrideConfirmationWithSameAddress(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Delay; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: true, + IsRouter: false, + }) + if got, want := e.neigh.State, Delay; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + if got, want := e.neigh.LinkAddr, entryTestLinkAddr1; got != want { + t.Errorf("got e.neigh.LinkAddr = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryDelayToStaleWhenProbeWithDifferentAddress(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Delay; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleProbeLocked(entryTestLinkAddr2) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryDelayToStaleWhenConfirmationWithDifferentAddress(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Delay; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr2, ReachabilityConfirmationFlags{ + Solicited: false, + Override: true, + IsRouter: false, + }) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryDelayToProbe(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Delay; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + clock.advance(c.DelayFirstProbeTime) + + wantProbes := []entryTestProbeInfo{ + // The first probe is caused by the Unknown-to-Incomplete transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + // The second probe is caused by the Delay-to-Probe transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + e.mu.Lock() + if got, want := e.neigh.State, Probe; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() +} + +func TestEntryProbeToStaleWhenProbeWithDifferentAddress(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + e.mu.Unlock() + + clock.advance(c.DelayFirstProbeTime) + + wantProbes := []entryTestProbeInfo{ + // The first probe is caused by the Unknown-to-Incomplete transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + // The second probe is caused by the Delay-to-Probe transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + e.mu.Lock() + if got, want := e.neigh.State, Probe; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleProbeLocked(entryTestLinkAddr2) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + e.mu.Lock() + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() +} + +func TestEntryProbeToStaleWhenConfirmationWithDifferentAddress(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + e.mu.Unlock() + + clock.advance(c.DelayFirstProbeTime) + + wantProbes := []entryTestProbeInfo{ + // The first probe is caused by the Unknown-to-Incomplete transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + // The second probe is caused by the Delay-to-Probe transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + e.mu.Lock() + if got, want := e.neigh.State, Probe; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr2, ReachabilityConfirmationFlags{ + Solicited: false, + Override: true, + IsRouter: false, + }) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + e.mu.Lock() + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() +} + +func TestEntryStaysProbeWhenOverrideConfirmationWithSameAddress(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + e.mu.Unlock() + + clock.advance(c.DelayFirstProbeTime) + + wantProbes := []entryTestProbeInfo{ + // The first probe is caused by the Unknown-to-Incomplete transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + // The second probe is caused by the Delay-to-Probe transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + e.mu.Lock() + if got, want := e.neigh.State, Probe; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: true, + IsRouter: false, + }) + if got, want := e.neigh.State, Probe; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + if got, want := e.neigh.LinkAddr, entryTestLinkAddr1; got != want { + t.Errorf("got e.neigh.LinkAddr = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +// TestEntryUnknownToStaleToProbeToReachable exercises the following scenario: +// 1. Probe is received +// 2. Entry is created in Stale +// 3. Packet is queued on the entry +// 4. Entry transitions to Delay then Probe +// 5. Probe is sent +func TestEntryUnknownToStaleToProbeToReachable(t *testing.T) { + c := DefaultNUDConfigurations() + // Eliminate random factors from ReachableTime computation so the transition + // from Probe to Reachable will only take BaseReachableTime duration. + c.MinRandomFactor = 1 + c.MaxRandomFactor = 1 + + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handleProbeLocked(entryTestLinkAddr1) + e.handlePacketQueuedLocked() + e.mu.Unlock() + + clock.advance(c.DelayFirstProbeTime) + + wantProbes := []entryTestProbeInfo{ + // Probe caused by the Delay-to-Probe transition + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + e.mu.Lock() + if got, want := e.neigh.State, Probe; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr2, ReachabilityConfirmationFlags{ + Solicited: true, + Override: true, + IsRouter: false, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + if got, want := e.neigh.LinkAddr, entryTestLinkAddr2; got != want { + t.Errorf("got e.neigh.LinkAddr = %q, want = %q", got, want) + } + e.mu.Unlock() + + clock.advance(c.BaseReachableTime) + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Reachable, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryProbeToReachableWhenSolicitedOverrideConfirmation(t *testing.T) { + c := DefaultNUDConfigurations() + // Eliminate random factors from ReachableTime computation so the transition + // from Stale to Reachable will only take BaseReachableTime duration. + c.MinRandomFactor = 1 + c.MaxRandomFactor = 1 + + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + e.mu.Unlock() + + clock.advance(c.DelayFirstProbeTime) + + wantProbes := []entryTestProbeInfo{ + // The first probe is caused by the Unknown-to-Incomplete transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + // The second probe is caused by the Delay-to-Probe transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + e.mu.Lock() + if got, want := e.neigh.State, Probe; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr2, ReachabilityConfirmationFlags{ + Solicited: true, + Override: true, + IsRouter: false, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + if got, want := e.neigh.LinkAddr, entryTestLinkAddr2; got != want { + t.Errorf("got e.neigh.LinkAddr = %q, want = %q", got, want) + } + e.mu.Unlock() + + clock.advance(c.BaseReachableTime) + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Reachable, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryProbeToReachableWhenSolicitedConfirmationWithSameAddress(t *testing.T) { + c := DefaultNUDConfigurations() + // Eliminate random factors from ReachableTime computation so the transition + // from Stale to Reachable will only take BaseReachableTime duration. + c.MinRandomFactor = 1 + c.MaxRandomFactor = 1 + + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + e.mu.Unlock() + + clock.advance(c.DelayFirstProbeTime) + + wantProbes := []entryTestProbeInfo{ + // The first probe is caused by the Unknown-to-Incomplete transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + // The second probe is caused by the Delay-to-Probe transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + e.mu.Lock() + if got, want := e.neigh.State, Probe; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + clock.advance(c.BaseReachableTime) + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryProbeToFailed(t *testing.T) { + c := DefaultNUDConfigurations() + c.MaxMulticastProbes = 3 + c.MaxUnicastProbes = 3 + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + e.mu.Unlock() + + waitFor := c.DelayFirstProbeTime + c.RetransmitTimer*time.Duration(c.MaxUnicastProbes) + clock.advance(waitFor) + + wantProbes := []entryTestProbeInfo{ + // The first probe is caused by the Unknown-to-Incomplete transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + // The next three probe are caused by the Delay-to-Probe transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + { + EventType: entryTestRemoved, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + e.mu.Lock() + if got, want := e.neigh.State, Failed; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() +} + +func TestEntryFailedGetsDeleted(t *testing.T) { + c := DefaultNUDConfigurations() + c.MaxMulticastProbes = 3 + c.MaxUnicastProbes = 3 + e, nudDisp, linkRes, clock := entryTestSetup(c) + + // Verify the cache contains the entry. + if _, ok := e.nic.neigh.cache[entryTestAddr1]; !ok { + t.Errorf("expected entry %q to exist in the neighbor cache", entryTestAddr1) + } + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + e.mu.Unlock() + + waitFor := c.DelayFirstProbeTime + c.RetransmitTimer*time.Duration(c.MaxUnicastProbes) + c.UnreachableTime + clock.advance(waitFor) + + wantProbes := []entryTestProbeInfo{ + // The first probe is caused by the Unknown-to-Incomplete transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + // The next three probe are caused by the Delay-to-Probe transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + { + EventType: entryTestRemoved, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + // Verify the cache no longer contains the entry. + if _, ok := e.nic.neigh.cache[entryTestAddr1]; ok { + t.Errorf("entry %q should have been deleted from the neighbor cache", entryTestAddr1) + } +} diff --git a/pkg/tcpip/stack/neighborstate_string.go b/pkg/tcpip/stack/neighborstate_string.go new file mode 100644 index 000000000..aa7311ec6 --- /dev/null +++ b/pkg/tcpip/stack/neighborstate_string.go @@ -0,0 +1,44 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by "stringer -type NeighborState"; DO NOT EDIT. + +package stack + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[Unknown-0] + _ = x[Incomplete-1] + _ = x[Reachable-2] + _ = x[Stale-3] + _ = x[Delay-4] + _ = x[Probe-5] + _ = x[Static-6] + _ = x[Failed-7] +} + +const _NeighborState_name = "UnknownIncompleteReachableStaleDelayProbeStaticFailed" + +var _NeighborState_index = [...]uint8{0, 7, 17, 26, 31, 36, 41, 47, 53} + +func (i NeighborState) String() string { + if i >= NeighborState(len(_NeighborState_index)-1) { + return "NeighborState(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _NeighborState_name[_NeighborState_index[i]:_NeighborState_index[i+1]] +} diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 9256d4d43..eaaf756cd 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -16,6 +16,7 @@ package stack import ( "fmt" + "math/rand" "reflect" "sort" "strings" @@ -45,6 +46,7 @@ type NIC struct { context NICContext stats NICStats + neigh *neighborCache mu struct { sync.RWMutex @@ -141,6 +143,16 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC nic.mu.packetEPs[netProto.Number()] = []PacketEndpoint{} } + // Check for Neighbor Unreachability Detection support. + if ep.Capabilities()&CapabilityResolutionRequired != 0 && len(stack.linkAddrResolvers) != 0 { + rng := rand.New(rand.NewSource(stack.clock.NowNanoseconds())) + nic.neigh = &neighborCache{ + nic: nic, + state: NewNUDState(stack.nudConfigs, rng), + cache: make(map[tcpip.Address]*neighborEntry, neighborCacheSize), + } + } + nic.linkEP.Attach(nic) return nic @@ -205,6 +217,11 @@ func (n *NIC) disableLocked() *tcpip.Error { } if _, ok := n.stack.networkProtocols[header.IPv4ProtocolNumber]; ok { + // The NIC may have already left the multicast group. + if err := n.leaveGroupLocked(header.IPv4AllSystems, false /* force */); err != nil && err != tcpip.ErrBadLocalAddress { + return err + } + // The address may have already been removed. if err := n.removePermanentAddressLocked(ipv4BroadcastAddr.AddressWithPrefix.Address); err != nil && err != tcpip.ErrBadLocalAddress { return err @@ -243,6 +260,13 @@ func (n *NIC) enable() *tcpip.Error { if _, err := n.addAddressLocked(ipv4BroadcastAddr, NeverPrimaryEndpoint, permanent, static, false /* deprecated */); err != nil { return err } + + // As per RFC 1122 section 3.3.7, all hosts should join the all-hosts + // multicast group. Note, the IANA calls the all-hosts multicast group the + // all-systems multicast group. + if err := n.joinGroupLocked(header.IPv4ProtocolNumber, header.IPv4AllSystems); err != nil { + return err + } } // Join the IPv6 All-Nodes Multicast group if the stack is configured to @@ -597,6 +621,9 @@ func (n *NIC) findEndpoint(protocol tcpip.NetworkProtocolNumber, address tcpip.A // If none exists a temporary one may be created if we are in promiscuous mode // or spoofing. Promiscuous mode will only be checked if promiscuous is true. // Similarly, spoofing will only be checked if spoofing is true. +// +// If the address is the IPv4 broadcast address for an endpoint's network, that +// endpoint will be returned. func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior, tempRef getRefBehaviour) *referencedNetworkEndpoint { n.mu.RLock() @@ -621,6 +648,16 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t } } + // Check if address is a broadcast address for the endpoint's network. + // + // Only IPv4 has a notion of broadcast addresses. + if protocol == header.IPv4ProtocolNumber { + if ref := n.getRefForBroadcastRLocked(address); ref != nil { + n.mu.RUnlock() + return ref + } + } + // A usable reference was not found, create a temporary one if requested by // the caller or if the address is found in the NIC's subnets. createTempEP := spoofingOrPromiscuous @@ -658,8 +695,34 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t return ref } +// getRefForBroadcastLocked returns an endpoint where address is the IPv4 +// broadcast address for the endpoint's network. +// +// n.mu MUST be read locked. +func (n *NIC) getRefForBroadcastRLocked(address tcpip.Address) *referencedNetworkEndpoint { + for _, ref := range n.mu.endpoints { + // Only IPv4 has a notion of broadcast addresses. + if ref.protocol != header.IPv4ProtocolNumber { + continue + } + + addr := ref.addrWithPrefix() + subnet := addr.Subnet() + if subnet.IsBroadcast(address) && ref.tryIncRef() { + return ref + } + } + + return nil +} + /// getRefOrCreateTempLocked returns an existing endpoint for address or creates /// and returns a temporary endpoint. +// +// If the address is the IPv4 broadcast address for an endpoint's network, that +// endpoint will be returned. +// +// n.mu must be write locked. func (n *NIC) getRefOrCreateTempLocked(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior) *referencedNetworkEndpoint { if ref, ok := n.mu.endpoints[NetworkEndpointID{address}]; ok { // No need to check the type as we are ok with expired endpoints at this @@ -673,6 +736,15 @@ func (n *NIC) getRefOrCreateTempLocked(protocol tcpip.NetworkProtocolNumber, add n.removeEndpointLocked(ref) } + // Check if address is a broadcast address for an endpoint's network. + // + // Only IPv4 has a notion of broadcast addresses. + if protocol == header.IPv4ProtocolNumber { + if ref := n.getRefForBroadcastRLocked(address); ref != nil { + return ref + } + } + // Add a new temporary endpoint. netProto, ok := n.stack.networkProtocols[protocol] if !ok { @@ -1540,6 +1612,27 @@ func (n *NIC) setNDPConfigs(c NDPConfigurations) { n.mu.Unlock() } +// NUDConfigs gets the NUD configurations for n. +func (n *NIC) NUDConfigs() (NUDConfigurations, *tcpip.Error) { + if n.neigh == nil { + return NUDConfigurations{}, tcpip.ErrNotSupported + } + return n.neigh.config(), nil +} + +// setNUDConfigs sets the NUD configurations for n. +// +// Note, if c contains invalid NUD configuration values, it will be fixed to +// use default values for the erroneous values. +func (n *NIC) setNUDConfigs(c NUDConfigurations) *tcpip.Error { + if n.neigh == nil { + return tcpip.ErrNotSupported + } + c.resetInvalidFields() + n.neigh.setConfig(c) + return nil +} + // handleNDPRA handles an NDP Router Advertisement message that arrived on n. func (n *NIC) handleNDPRA(ip tcpip.Address, ra header.NDPRouterAdvert) { n.mu.Lock() diff --git a/pkg/tcpip/stack/nud.go b/pkg/tcpip/stack/nud.go new file mode 100644 index 000000000..e1ec15487 --- /dev/null +++ b/pkg/tcpip/stack/nud.go @@ -0,0 +1,466 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stack + +import ( + "math" + "sync" + "time" + + "gvisor.dev/gvisor/pkg/tcpip" +) + +const ( + // defaultBaseReachableTime is the default base duration for computing the + // random reachable time. + // + // Reachable time is the duration for which a neighbor is considered + // reachable after a positive reachability confirmation is received. It is a + // function of a uniformly distributed random value between the minimum and + // maximum random factors, multiplied by the base reachable time. Using a + // random component eliminates the possibility that Neighbor Unreachability + // Detection messages will synchronize with each other. + // + // Default taken from REACHABLE_TIME of RFC 4861 section 10. + defaultBaseReachableTime = 30 * time.Second + + // minimumBaseReachableTime is the minimum base duration for computing the + // random reachable time. + // + // Minimum = 1ms + minimumBaseReachableTime = time.Millisecond + + // defaultMinRandomFactor is the default minimum value of the random factor + // used for computing reachable time. + // + // Default taken from MIN_RANDOM_FACTOR of RFC 4861 section 10. + defaultMinRandomFactor = 0.5 + + // defaultMaxRandomFactor is the default maximum value of the random factor + // used for computing reachable time. + // + // The default value depends on the value of MinRandomFactor. + // If MinRandomFactor is less than MAX_RANDOM_FACTOR of RFC 4861 section 10, + // the value from the RFC will be used; otherwise, the default is + // MinRandomFactor multiplied by three. + defaultMaxRandomFactor = 1.5 + + // defaultRetransmitTimer is the default amount of time to wait between + // sending reachability probes. + // + // Default taken from RETRANS_TIMER of RFC 4861 section 10. + defaultRetransmitTimer = time.Second + + // minimumRetransmitTimer is the minimum amount of time to wait between + // sending reachability probes. + // + // Note, RFC 4861 does not impose a minimum Retransmit Timer, but we do here + // to make sure the messages are not sent all at once. We also come to this + // value because in the RetransmitTimer field of a Router Advertisement, a + // value of 0 means unspecified, so the smallest valid value is 1. Note, the + // unit of the RetransmitTimer field in the Router Advertisement is + // milliseconds. + minimumRetransmitTimer = time.Millisecond + + // defaultDelayFirstProbeTime is the default duration to wait for a + // non-Neighbor-Discovery related protocol to reconfirm reachability after + // entering the DELAY state. After this time, a reachability probe will be + // sent and the entry will transition to the PROBE state. + // + // Default taken from DELAY_FIRST_PROBE_TIME of RFC 4861 section 10. + defaultDelayFirstProbeTime = 5 * time.Second + + // defaultMaxMulticastProbes is the default number of reachabililty probes + // to send before concluding negative reachability and deleting the neighbor + // entry from the INCOMPLETE state. + // + // Default taken from MAX_MULTICAST_SOLICIT of RFC 4861 section 10. + defaultMaxMulticastProbes = 3 + + // defaultMaxUnicastProbes is the default number of reachability probes to + // send before concluding retransmission from within the PROBE state should + // cease and the entry SHOULD be deleted. + // + // Default taken from MAX_UNICASE_SOLICIT of RFC 4861 section 10. + defaultMaxUnicastProbes = 3 + + // defaultMaxAnycastDelayTime is the default time in which the stack SHOULD + // delay sending a response for a random time between 0 and this time, if the + // target address is an anycast address. + // + // Default taken from MAX_ANYCAST_DELAY_TIME of RFC 4861 section 10. + defaultMaxAnycastDelayTime = time.Second + + // defaultMaxReachbilityConfirmations is the default amount of unsolicited + // reachability confirmation messages a node MAY send to all-node multicast + // address when it determines its link-layer address has changed. + // + // Default taken from MAX_NEIGHBOR_ADVERTISEMENT of RFC 4861 section 10. + defaultMaxReachbilityConfirmations = 3 + + // defaultUnreachableTime is the default duration for how long an entry will + // remain in the FAILED state before being removed from the neighbor cache. + // + // Note, there is no equivalent protocol constant defined in RFC 4861. It + // leaves the specifics of any garbage collection mechanism up to the + // implementation. + defaultUnreachableTime = 5 * time.Second +) + +// NUDDispatcher is the interface integrators of netstack must implement to +// receive and handle NUD related events. +type NUDDispatcher interface { + // OnNeighborAdded will be called when a new entry is added to a NIC's (with + // ID nicID) neighbor table. + // + // This function is permitted to block indefinitely without interfering with + // the stack's operation. + // + // May be called concurrently. + OnNeighborAdded(nicID tcpip.NICID, ipAddr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time) + + // OnNeighborChanged will be called when an entry in a NIC's (with ID nicID) + // neighbor table changes state and/or link address. + // + // This function is permitted to block indefinitely without interfering with + // the stack's operation. + // + // May be called concurrently. + OnNeighborChanged(nicID tcpip.NICID, ipAddr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time) + + // OnNeighborRemoved will be called when an entry is removed from a NIC's + // (with ID nicID) neighbor table. + // + // This function is permitted to block indefinitely without interfering with + // the stack's operation. + // + // May be called concurrently. + OnNeighborRemoved(nicID tcpip.NICID, ipAddr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time) +} + +// ReachabilityConfirmationFlags describes the flags used within a reachability +// confirmation (e.g. ARP reply or Neighbor Advertisement for ARP or NDP, +// respectively). +type ReachabilityConfirmationFlags struct { + // Solicited indicates that the advertisement was sent in response to a + // reachability probe. + Solicited bool + + // Override indicates that the reachability confirmation should override an + // existing neighbor cache entry and update the cached link-layer address. + // When Override is not set the confirmation will not update a cached + // link-layer address, but will update an existing neighbor cache entry for + // which no link-layer address is known. + Override bool + + // IsRouter indicates that the sender is a router. + IsRouter bool +} + +// NUDHandler communicates external events to the Neighbor Unreachability +// Detection state machine, which is implemented per-interface. This is used by +// network endpoints to inform the Neighbor Cache of probes and confirmations. +type NUDHandler interface { + // HandleProbe processes an incoming neighbor probe (e.g. ARP request or + // Neighbor Solicitation for ARP or NDP, respectively). Validation of the + // probe needs to be performed before calling this function since the + // Neighbor Cache doesn't have access to view the NIC's assigned addresses. + HandleProbe(remoteAddr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, remoteLinkAddr tcpip.LinkAddress, linkRes LinkAddressResolver) + + // HandleConfirmation processes an incoming neighbor confirmation (e.g. ARP + // reply or Neighbor Advertisement for ARP or NDP, respectively). + HandleConfirmation(addr tcpip.Address, linkAddr tcpip.LinkAddress, flags ReachabilityConfirmationFlags) + + // HandleUpperLevelConfirmation processes an incoming upper-level protocol + // (e.g. TCP acknowledgements) reachability confirmation. + HandleUpperLevelConfirmation(addr tcpip.Address) +} + +// NUDConfigurations is the NUD configurations for the netstack. This is used +// by the neighbor cache to operate the NUD state machine on each device in the +// local network. +type NUDConfigurations struct { + // BaseReachableTime is the base duration for computing the random reachable + // time. + // + // Reachable time is the duration for which a neighbor is considered + // reachable after a positive reachability confirmation is received. It is a + // function of uniformly distributed random value between minRandomFactor and + // maxRandomFactor multiplied by baseReachableTime. Using a random component + // eliminates the possibility that Neighbor Unreachability Detection messages + // will synchronize with each other. + // + // After this time, a neighbor entry will transition from REACHABLE to STALE + // state. + // + // Must be greater than 0. + BaseReachableTime time.Duration + + // LearnBaseReachableTime enables learning BaseReachableTime during runtime + // from the neighbor discovery protocol, if supported. + // + // TODO(gvisor.dev/issue/2240): Implement this NUD configuration option. + LearnBaseReachableTime bool + + // MinRandomFactor is the minimum value of the random factor used for + // computing reachable time. + // + // See BaseReachbleTime for more information on computing the reachable time. + // + // Must be greater than 0. + MinRandomFactor float32 + + // MaxRandomFactor is the maximum value of the random factor used for + // computing reachabile time. + // + // See BaseReachbleTime for more information on computing the reachable time. + // + // Must be great than or equal to MinRandomFactor. + MaxRandomFactor float32 + + // RetransmitTimer is the duration between retransmission of reachability + // probes in the PROBE state. + RetransmitTimer time.Duration + + // LearnRetransmitTimer enables learning RetransmitTimer during runtime from + // the neighbor discovery protocol, if supported. + // + // TODO(gvisor.dev/issue/2241): Implement this NUD configuration option. + LearnRetransmitTimer bool + + // DelayFirstProbeTime is the duration to wait for a non-Neighbor-Discovery + // related protocol to reconfirm reachability after entering the DELAY state. + // After this time, a reachability probe will be sent and the entry will + // transition to the PROBE state. + // + // Must be greater than 0. + DelayFirstProbeTime time.Duration + + // MaxMulticastProbes is the number of reachability probes to send before + // concluding negative reachability and deleting the neighbor entry from the + // INCOMPLETE state. + // + // Must be greater than 0. + MaxMulticastProbes uint32 + + // MaxUnicastProbes is the number of reachability probes to send before + // concluding retransmission from within the PROBE state should cease and + // entry SHOULD be deleted. + // + // Must be greater than 0. + MaxUnicastProbes uint32 + + // MaxAnycastDelayTime is the time in which the stack SHOULD delay sending a + // response for a random time between 0 and this time, if the target address + // is an anycast address. + // + // TODO(gvisor.dev/issue/2242): Use this option when sending solicited + // neighbor confirmations to anycast addresses and proxying neighbor + // confirmations. + MaxAnycastDelayTime time.Duration + + // MaxReachabilityConfirmations is the number of unsolicited reachability + // confirmation messages a node MAY send to all-node multicast address when + // it determines its link-layer address has changed. + // + // TODO(gvisor.dev/issue/2246): Discuss if implementation of this NUD + // configuration option is necessary. + MaxReachabilityConfirmations uint32 + + // UnreachableTime describes how long an entry will remain in the FAILED + // state before being removed from the neighbor cache. + UnreachableTime time.Duration +} + +// DefaultNUDConfigurations returns a NUDConfigurations populated with default +// values defined by RFC 4861 section 10. +func DefaultNUDConfigurations() NUDConfigurations { + return NUDConfigurations{ + BaseReachableTime: defaultBaseReachableTime, + LearnBaseReachableTime: true, + MinRandomFactor: defaultMinRandomFactor, + MaxRandomFactor: defaultMaxRandomFactor, + RetransmitTimer: defaultRetransmitTimer, + LearnRetransmitTimer: true, + DelayFirstProbeTime: defaultDelayFirstProbeTime, + MaxMulticastProbes: defaultMaxMulticastProbes, + MaxUnicastProbes: defaultMaxUnicastProbes, + MaxAnycastDelayTime: defaultMaxAnycastDelayTime, + MaxReachabilityConfirmations: defaultMaxReachbilityConfirmations, + UnreachableTime: defaultUnreachableTime, + } +} + +// resetInvalidFields modifies an invalid NDPConfigurations with valid values. +// If invalid values are present in c, the corresponding default values will be +// used instead. This is needed to check, and conditionally fix, user-specified +// NUDConfigurations. +func (c *NUDConfigurations) resetInvalidFields() { + if c.BaseReachableTime < minimumBaseReachableTime { + c.BaseReachableTime = defaultBaseReachableTime + } + if c.MinRandomFactor <= 0 { + c.MinRandomFactor = defaultMinRandomFactor + } + if c.MaxRandomFactor < c.MinRandomFactor { + c.MaxRandomFactor = calcMaxRandomFactor(c.MinRandomFactor) + } + if c.RetransmitTimer < minimumRetransmitTimer { + c.RetransmitTimer = defaultRetransmitTimer + } + if c.DelayFirstProbeTime == 0 { + c.DelayFirstProbeTime = defaultDelayFirstProbeTime + } + if c.MaxMulticastProbes == 0 { + c.MaxMulticastProbes = defaultMaxMulticastProbes + } + if c.MaxUnicastProbes == 0 { + c.MaxUnicastProbes = defaultMaxUnicastProbes + } + if c.UnreachableTime == 0 { + c.UnreachableTime = defaultUnreachableTime + } +} + +// calcMaxRandomFactor calculates the maximum value of the random factor used +// for computing reachable time. This function is necessary for when the +// default specified in RFC 4861 section 10 is less than the current +// MinRandomFactor. +// +// Assumes minRandomFactor is positive since validation of the minimum value +// should come before the validation of the maximum. +func calcMaxRandomFactor(minRandomFactor float32) float32 { + if minRandomFactor > defaultMaxRandomFactor { + return minRandomFactor * 3 + } + return defaultMaxRandomFactor +} + +// A Rand is a source of random numbers. +type Rand interface { + // Float32 returns, as a float32, a pseudo-random number in [0.0,1.0). + Float32() float32 +} + +// NUDState stores states needed for calculating reachable time. +type NUDState struct { + rng Rand + + // mu protects the fields below. + // + // It is necessary for NUDState to handle its own locking since neighbor + // entries may access the NUD state from within the goroutine spawned by + // time.AfterFunc(). This goroutine may run concurrently with the main + // process for controlling the neighbor cache and would otherwise introduce + // race conditions if NUDState was not locked properly. + mu sync.RWMutex + + config NUDConfigurations + + // reachableTime is the duration to wait for a REACHABLE entry to + // transition into STALE after inactivity. This value is calculated with + // the algorithm defined in RFC 4861 section 6.3.2. + reachableTime time.Duration + + expiration time.Time + prevBaseReachableTime time.Duration + prevMinRandomFactor float32 + prevMaxRandomFactor float32 +} + +// NewNUDState returns new NUDState using c as configuration and the specified +// random number generator for use in recomputing ReachableTime. +func NewNUDState(c NUDConfigurations, rng Rand) *NUDState { + s := &NUDState{ + rng: rng, + } + s.config = c + return s +} + +// Config returns the NUD configuration. +func (s *NUDState) Config() NUDConfigurations { + s.mu.RLock() + defer s.mu.RUnlock() + return s.config +} + +// SetConfig replaces the existing NUD configurations with c. +func (s *NUDState) SetConfig(c NUDConfigurations) { + s.mu.Lock() + defer s.mu.Unlock() + s.config = c +} + +// ReachableTime returns the duration to wait for a REACHABLE entry to +// transition into STALE after inactivity. This value is recalculated for new +// values of BaseReachableTime, MinRandomFactor, and MaxRandomFactor using the +// algorithm defined in RFC 4861 section 6.3.2. +func (s *NUDState) ReachableTime() time.Duration { + s.mu.Lock() + defer s.mu.Unlock() + + if time.Now().After(s.expiration) || + s.config.BaseReachableTime != s.prevBaseReachableTime || + s.config.MinRandomFactor != s.prevMinRandomFactor || + s.config.MaxRandomFactor != s.prevMaxRandomFactor { + return s.recomputeReachableTimeLocked() + } + return s.reachableTime +} + +// recomputeReachableTimeLocked forces a recalculation of ReachableTime using +// the algorithm defined in RFC 4861 section 6.3.2. +// +// This SHOULD automatically be invoked during certain situations, as per +// RFC 4861 section 6.3.4: +// +// If the received Reachable Time value is non-zero, the host SHOULD set its +// BaseReachableTime variable to the received value. If the new value +// differs from the previous value, the host SHOULD re-compute a new random +// ReachableTime value. ReachableTime is computed as a uniformly +// distributed random value between MIN_RANDOM_FACTOR and MAX_RANDOM_FACTOR +// times the BaseReachableTime. Using a random component eliminates the +// possibility that Neighbor Unreachability Detection messages will +// synchronize with each other. +// +// In most cases, the advertised Reachable Time value will be the same in +// consecutive Router Advertisements, and a host's BaseReachableTime rarely +// changes. In such cases, an implementation SHOULD ensure that a new +// random value gets re-computed at least once every few hours. +// +// s.mu MUST be locked for writing. +func (s *NUDState) recomputeReachableTimeLocked() time.Duration { + s.prevBaseReachableTime = s.config.BaseReachableTime + s.prevMinRandomFactor = s.config.MinRandomFactor + s.prevMaxRandomFactor = s.config.MaxRandomFactor + + randomFactor := s.config.MinRandomFactor + s.rng.Float32()*(s.config.MaxRandomFactor-s.config.MinRandomFactor) + + // Check for overflow, given that minRandomFactor and maxRandomFactor are + // guaranteed to be positive numbers. + if float32(math.MaxInt64)/randomFactor < float32(s.config.BaseReachableTime) { + s.reachableTime = time.Duration(math.MaxInt64) + } else if randomFactor == 1 { + // Avoid loss of precision when a large base reachable time is used. + s.reachableTime = s.config.BaseReachableTime + } else { + reachableTime := int64(float32(s.config.BaseReachableTime) * randomFactor) + s.reachableTime = time.Duration(reachableTime) + } + + s.expiration = time.Now().Add(2 * time.Hour) + return s.reachableTime +} diff --git a/pkg/tcpip/stack/nud_test.go b/pkg/tcpip/stack/nud_test.go new file mode 100644 index 000000000..2494ee610 --- /dev/null +++ b/pkg/tcpip/stack/nud_test.go @@ -0,0 +1,795 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stack_test + +import ( + "math" + "testing" + "time" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/link/channel" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +const ( + defaultBaseReachableTime = 30 * time.Second + minimumBaseReachableTime = time.Millisecond + defaultMinRandomFactor = 0.5 + defaultMaxRandomFactor = 1.5 + defaultRetransmitTimer = time.Second + minimumRetransmitTimer = time.Millisecond + defaultDelayFirstProbeTime = 5 * time.Second + defaultMaxMulticastProbes = 3 + defaultMaxUnicastProbes = 3 + defaultMaxAnycastDelayTime = time.Second + defaultMaxReachbilityConfirmations = 3 + defaultUnreachableTime = 5 * time.Second + + defaultFakeRandomNum = 0.5 +) + +// fakeRand is a deterministic random number generator. +type fakeRand struct { + num float32 +} + +var _ stack.Rand = (*fakeRand)(nil) + +func (f *fakeRand) Float32() float32 { + return f.num +} + +// TestSetNUDConfigurationFailsForBadNICID tests to make sure we get an error if +// we attempt to update NUD configurations using an invalid NICID. +func TestSetNUDConfigurationFailsForBadNICID(t *testing.T) { + s := stack.New(stack.Options{ + // A neighbor cache is required to store NUDConfigurations. The networking + // stack will only allocate neighbor caches if a protocol providing link + // address resolution is specified (e.g. ARP or IPv6). + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + }) + + // No NIC with ID 1 yet. + config := stack.NUDConfigurations{} + if err := s.SetNUDConfigurations(1, config); err != tcpip.ErrUnknownNICID { + t.Fatalf("got s.SetNDPConfigurations(1, %+v) = %v, want = %s", config, err, tcpip.ErrUnknownNICID) + } +} + +// TestNUDConfigurationFailsForNotSupported tests to make sure we get a +// NotSupported error if we attempt to retrieve NUD configurations when the +// stack doesn't support NUD. +// +// The stack will report to not support NUD if a neighbor cache for a given NIC +// is not allocated. The networking stack will only allocate neighbor caches if +// a protocol providing link address resolution is specified (e.g. ARP, IPv6). +func TestNUDConfigurationFailsForNotSupported(t *testing.T) { + const nicID = 1 + + e := channel.New(0, 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + + s := stack.New(stack.Options{ + NUDConfigs: stack.DefaultNUDConfigurations(), + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + if _, err := s.NUDConfigurations(nicID); err != tcpip.ErrNotSupported { + t.Fatalf("got s.NDPConfigurations(%d) = %v, want = %s", nicID, err, tcpip.ErrNotSupported) + } +} + +// TestNUDConfigurationFailsForNotSupported tests to make sure we get a +// NotSupported error if we attempt to set NUD configurations when the stack +// doesn't support NUD. +// +// The stack will report to not support NUD if a neighbor cache for a given NIC +// is not allocated. The networking stack will only allocate neighbor caches if +// a protocol providing link address resolution is specified (e.g. ARP, IPv6). +func TestSetNUDConfigurationFailsForNotSupported(t *testing.T) { + const nicID = 1 + + e := channel.New(0, 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + + s := stack.New(stack.Options{ + NUDConfigs: stack.DefaultNUDConfigurations(), + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + + config := stack.NUDConfigurations{} + if err := s.SetNUDConfigurations(nicID, config); err != tcpip.ErrNotSupported { + t.Fatalf("got s.SetNDPConfigurations(%d, %+v) = %v, want = %s", nicID, config, err, tcpip.ErrNotSupported) + } +} + +// TestDefaultNUDConfigurationIsValid verifies that calling +// resetInvalidFields() on the result of DefaultNUDConfigurations() does not +// change anything. DefaultNUDConfigurations() should return a valid +// NUDConfigurations. +func TestDefaultNUDConfigurations(t *testing.T) { + const nicID = 1 + + e := channel.New(0, 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + + s := stack.New(stack.Options{ + // A neighbor cache is required to store NUDConfigurations. The networking + // stack will only allocate neighbor caches if a protocol providing link + // address resolution is specified (e.g. ARP or IPv6). + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NUDConfigs: stack.DefaultNUDConfigurations(), + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + c, err := s.NUDConfigurations(nicID) + if err != nil { + t.Fatalf("got stack.NUDConfigurations(%d) = %s", nicID, err) + } + if got, want := c, stack.DefaultNUDConfigurations(); got != want { + t.Errorf("got stack.NUDConfigurations(%d) = %+v, want = %+v", nicID, got, want) + } +} + +func TestNUDConfigurationsBaseReachableTime(t *testing.T) { + tests := []struct { + name string + baseReachableTime time.Duration + want time.Duration + }{ + // Invalid cases + { + name: "EqualToZero", + baseReachableTime: 0, + want: defaultBaseReachableTime, + }, + // Valid cases + { + name: "MoreThanZero", + baseReachableTime: time.Millisecond, + want: time.Millisecond, + }, + { + name: "MoreThanDefaultBaseReachableTime", + baseReachableTime: 2 * defaultBaseReachableTime, + want: 2 * defaultBaseReachableTime, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + const nicID = 1 + + c := stack.DefaultNUDConfigurations() + c.BaseReachableTime = test.baseReachableTime + + e := channel.New(0, 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + + s := stack.New(stack.Options{ + // A neighbor cache is required to store NUDConfigurations. The + // networking stack will only allocate neighbor caches if a protocol + // providing link address resolution is specified (e.g. ARP or IPv6). + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NUDConfigs: c, + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + sc, err := s.NUDConfigurations(nicID) + if err != nil { + t.Fatalf("got stack.NUDConfigurations(%d) = %s", nicID, err) + } + if got := sc.BaseReachableTime; got != test.want { + t.Errorf("got BaseReachableTime = %q, want = %q", got, test.want) + } + }) + } +} + +func TestNUDConfigurationsMinRandomFactor(t *testing.T) { + tests := []struct { + name string + minRandomFactor float32 + want float32 + }{ + // Invalid cases + { + name: "LessThanZero", + minRandomFactor: -1, + want: defaultMinRandomFactor, + }, + { + name: "EqualToZero", + minRandomFactor: 0, + want: defaultMinRandomFactor, + }, + // Valid cases + { + name: "MoreThanZero", + minRandomFactor: 1, + want: 1, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + const nicID = 1 + + c := stack.DefaultNUDConfigurations() + c.MinRandomFactor = test.minRandomFactor + + e := channel.New(0, 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + + s := stack.New(stack.Options{ + // A neighbor cache is required to store NUDConfigurations. The + // networking stack will only allocate neighbor caches if a protocol + // providing link address resolution is specified (e.g. ARP or IPv6). + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NUDConfigs: c, + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + sc, err := s.NUDConfigurations(nicID) + if err != nil { + t.Fatalf("got stack.NUDConfigurations(%d) = %s", nicID, err) + } + if got := sc.MinRandomFactor; got != test.want { + t.Errorf("got MinRandomFactor = %f, want = %f", got, test.want) + } + }) + } +} + +func TestNUDConfigurationsMaxRandomFactor(t *testing.T) { + tests := []struct { + name string + minRandomFactor float32 + maxRandomFactor float32 + want float32 + }{ + // Invalid cases + { + name: "LessThanZero", + minRandomFactor: defaultMinRandomFactor, + maxRandomFactor: -1, + want: defaultMaxRandomFactor, + }, + { + name: "EqualToZero", + minRandomFactor: defaultMinRandomFactor, + maxRandomFactor: 0, + want: defaultMaxRandomFactor, + }, + { + name: "LessThanMinRandomFactor", + minRandomFactor: defaultMinRandomFactor, + maxRandomFactor: defaultMinRandomFactor * 0.99, + want: defaultMaxRandomFactor, + }, + { + name: "MoreThanMinRandomFactorWhenMinRandomFactorIsLargerThanMaxRandomFactorDefault", + minRandomFactor: defaultMaxRandomFactor * 2, + maxRandomFactor: defaultMaxRandomFactor, + want: defaultMaxRandomFactor * 6, + }, + // Valid cases + { + name: "EqualToMinRandomFactor", + minRandomFactor: defaultMinRandomFactor, + maxRandomFactor: defaultMinRandomFactor, + want: defaultMinRandomFactor, + }, + { + name: "MoreThanMinRandomFactor", + minRandomFactor: defaultMinRandomFactor, + maxRandomFactor: defaultMinRandomFactor * 1.1, + want: defaultMinRandomFactor * 1.1, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + const nicID = 1 + + c := stack.DefaultNUDConfigurations() + c.MinRandomFactor = test.minRandomFactor + c.MaxRandomFactor = test.maxRandomFactor + + e := channel.New(0, 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + + s := stack.New(stack.Options{ + // A neighbor cache is required to store NUDConfigurations. The + // networking stack will only allocate neighbor caches if a protocol + // providing link address resolution is specified (e.g. ARP or IPv6). + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NUDConfigs: c, + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + sc, err := s.NUDConfigurations(nicID) + if err != nil { + t.Fatalf("got stack.NUDConfigurations(%d) = %s", nicID, err) + } + if got := sc.MaxRandomFactor; got != test.want { + t.Errorf("got MaxRandomFactor = %f, want = %f", got, test.want) + } + }) + } +} + +func TestNUDConfigurationsRetransmitTimer(t *testing.T) { + tests := []struct { + name string + retransmitTimer time.Duration + want time.Duration + }{ + // Invalid cases + { + name: "EqualToZero", + retransmitTimer: 0, + want: defaultRetransmitTimer, + }, + { + name: "LessThanMinimumRetransmitTimer", + retransmitTimer: minimumRetransmitTimer - time.Nanosecond, + want: defaultRetransmitTimer, + }, + // Valid cases + { + name: "EqualToMinimumRetransmitTimer", + retransmitTimer: minimumRetransmitTimer, + want: minimumBaseReachableTime, + }, + { + name: "LargetThanMinimumRetransmitTimer", + retransmitTimer: 2 * minimumBaseReachableTime, + want: 2 * minimumBaseReachableTime, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + const nicID = 1 + + c := stack.DefaultNUDConfigurations() + c.RetransmitTimer = test.retransmitTimer + + e := channel.New(0, 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + + s := stack.New(stack.Options{ + // A neighbor cache is required to store NUDConfigurations. The + // networking stack will only allocate neighbor caches if a protocol + // providing link address resolution is specified (e.g. ARP or IPv6). + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NUDConfigs: c, + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + sc, err := s.NUDConfigurations(nicID) + if err != nil { + t.Fatalf("got stack.NUDConfigurations(%d) = %s", nicID, err) + } + if got := sc.RetransmitTimer; got != test.want { + t.Errorf("got RetransmitTimer = %q, want = %q", got, test.want) + } + }) + } +} + +func TestNUDConfigurationsDelayFirstProbeTime(t *testing.T) { + tests := []struct { + name string + delayFirstProbeTime time.Duration + want time.Duration + }{ + // Invalid cases + { + name: "EqualToZero", + delayFirstProbeTime: 0, + want: defaultDelayFirstProbeTime, + }, + // Valid cases + { + name: "MoreThanZero", + delayFirstProbeTime: time.Millisecond, + want: time.Millisecond, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + const nicID = 1 + + c := stack.DefaultNUDConfigurations() + c.DelayFirstProbeTime = test.delayFirstProbeTime + + e := channel.New(0, 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + + s := stack.New(stack.Options{ + // A neighbor cache is required to store NUDConfigurations. The + // networking stack will only allocate neighbor caches if a protocol + // providing link address resolution is specified (e.g. ARP or IPv6). + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NUDConfigs: c, + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + sc, err := s.NUDConfigurations(nicID) + if err != nil { + t.Fatalf("got stack.NUDConfigurations(%d) = %s", nicID, err) + } + if got := sc.DelayFirstProbeTime; got != test.want { + t.Errorf("got DelayFirstProbeTime = %q, want = %q", got, test.want) + } + }) + } +} + +func TestNUDConfigurationsMaxMulticastProbes(t *testing.T) { + tests := []struct { + name string + maxMulticastProbes uint32 + want uint32 + }{ + // Invalid cases + { + name: "EqualToZero", + maxMulticastProbes: 0, + want: defaultMaxMulticastProbes, + }, + // Valid cases + { + name: "MoreThanZero", + maxMulticastProbes: 1, + want: 1, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + const nicID = 1 + + c := stack.DefaultNUDConfigurations() + c.MaxMulticastProbes = test.maxMulticastProbes + + e := channel.New(0, 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + + s := stack.New(stack.Options{ + // A neighbor cache is required to store NUDConfigurations. The + // networking stack will only allocate neighbor caches if a protocol + // providing link address resolution is specified (e.g. ARP or IPv6). + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NUDConfigs: c, + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + sc, err := s.NUDConfigurations(nicID) + if err != nil { + t.Fatalf("got stack.NUDConfigurations(%d) = %s", nicID, err) + } + if got := sc.MaxMulticastProbes; got != test.want { + t.Errorf("got MaxMulticastProbes = %q, want = %q", got, test.want) + } + }) + } +} + +func TestNUDConfigurationsMaxUnicastProbes(t *testing.T) { + tests := []struct { + name string + maxUnicastProbes uint32 + want uint32 + }{ + // Invalid cases + { + name: "EqualToZero", + maxUnicastProbes: 0, + want: defaultMaxUnicastProbes, + }, + // Valid cases + { + name: "MoreThanZero", + maxUnicastProbes: 1, + want: 1, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + const nicID = 1 + + c := stack.DefaultNUDConfigurations() + c.MaxUnicastProbes = test.maxUnicastProbes + + e := channel.New(0, 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + + s := stack.New(stack.Options{ + // A neighbor cache is required to store NUDConfigurations. The + // networking stack will only allocate neighbor caches if a protocol + // providing link address resolution is specified (e.g. ARP or IPv6). + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NUDConfigs: c, + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + sc, err := s.NUDConfigurations(nicID) + if err != nil { + t.Fatalf("got stack.NUDConfigurations(%d) = %s", nicID, err) + } + if got := sc.MaxUnicastProbes; got != test.want { + t.Errorf("got MaxUnicastProbes = %q, want = %q", got, test.want) + } + }) + } +} + +func TestNUDConfigurationsUnreachableTime(t *testing.T) { + tests := []struct { + name string + unreachableTime time.Duration + want time.Duration + }{ + // Invalid cases + { + name: "EqualToZero", + unreachableTime: 0, + want: defaultUnreachableTime, + }, + // Valid cases + { + name: "MoreThanZero", + unreachableTime: time.Millisecond, + want: time.Millisecond, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + const nicID = 1 + + c := stack.DefaultNUDConfigurations() + c.UnreachableTime = test.unreachableTime + + e := channel.New(0, 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + + s := stack.New(stack.Options{ + // A neighbor cache is required to store NUDConfigurations. The + // networking stack will only allocate neighbor caches if a protocol + // providing link address resolution is specified (e.g. ARP or IPv6). + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NUDConfigs: c, + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + sc, err := s.NUDConfigurations(nicID) + if err != nil { + t.Fatalf("got stack.NUDConfigurations(%d) = %s", nicID, err) + } + if got := sc.UnreachableTime; got != test.want { + t.Errorf("got UnreachableTime = %q, want = %q", got, test.want) + } + }) + } +} + +// TestNUDStateReachableTime verifies the correctness of the ReachableTime +// computation. +func TestNUDStateReachableTime(t *testing.T) { + tests := []struct { + name string + baseReachableTime time.Duration + minRandomFactor float32 + maxRandomFactor float32 + want time.Duration + }{ + { + name: "AllZeros", + baseReachableTime: 0, + minRandomFactor: 0, + maxRandomFactor: 0, + want: 0, + }, + { + name: "ZeroMaxRandomFactor", + baseReachableTime: time.Second, + minRandomFactor: 0, + maxRandomFactor: 0, + want: 0, + }, + { + name: "ZeroMinRandomFactor", + baseReachableTime: time.Second, + minRandomFactor: 0, + maxRandomFactor: 1, + want: time.Duration(defaultFakeRandomNum * float32(time.Second)), + }, + { + name: "FractionalRandomFactor", + baseReachableTime: time.Duration(math.MaxInt64), + minRandomFactor: 0.001, + maxRandomFactor: 0.002, + want: time.Duration((0.001 + (0.001 * defaultFakeRandomNum)) * float32(math.MaxInt64)), + }, + { + name: "MinAndMaxRandomFactorsEqual", + baseReachableTime: time.Second, + minRandomFactor: 1, + maxRandomFactor: 1, + want: time.Second, + }, + { + name: "MinAndMaxRandomFactorsDifferent", + baseReachableTime: time.Second, + minRandomFactor: 1, + maxRandomFactor: 2, + want: time.Duration((1.0 + defaultFakeRandomNum) * float32(time.Second)), + }, + { + name: "MaxInt64", + baseReachableTime: time.Duration(math.MaxInt64), + minRandomFactor: 1, + maxRandomFactor: 1, + want: time.Duration(math.MaxInt64), + }, + { + name: "Overflow", + baseReachableTime: time.Duration(math.MaxInt64), + minRandomFactor: 1.5, + maxRandomFactor: 1.5, + want: time.Duration(math.MaxInt64), + }, + { + name: "DoubleOverflow", + baseReachableTime: time.Duration(math.MaxInt64), + minRandomFactor: 2.5, + maxRandomFactor: 2.5, + want: time.Duration(math.MaxInt64), + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + c := stack.NUDConfigurations{ + BaseReachableTime: test.baseReachableTime, + MinRandomFactor: test.minRandomFactor, + MaxRandomFactor: test.maxRandomFactor, + } + // A fake random number generator is used to ensure deterministic + // results. + rng := fakeRand{ + num: defaultFakeRandomNum, + } + s := stack.NewNUDState(c, &rng) + if got, want := s.ReachableTime(), test.want; got != want { + t.Errorf("got ReachableTime = %q, want = %q", got, want) + } + }) + } +} + +// TestNUDStateRecomputeReachableTime exercises the ReachableTime function +// twice to verify recomputation of reachable time when the min random factor, +// max random factor, or base reachable time changes. +func TestNUDStateRecomputeReachableTime(t *testing.T) { + const defaultBase = time.Second + const defaultMin = 2.0 * defaultMaxRandomFactor + const defaultMax = 3.0 * defaultMaxRandomFactor + + tests := []struct { + name string + baseReachableTime time.Duration + minRandomFactor float32 + maxRandomFactor float32 + want time.Duration + }{ + { + name: "BaseReachableTime", + baseReachableTime: 2 * defaultBase, + minRandomFactor: defaultMin, + maxRandomFactor: defaultMax, + want: time.Duration((defaultMin + (defaultMax-defaultMin)*defaultFakeRandomNum) * float32(2*defaultBase)), + }, + { + name: "MinRandomFactor", + baseReachableTime: defaultBase, + minRandomFactor: defaultMax, + maxRandomFactor: defaultMax, + want: time.Duration(defaultMax * float32(defaultBase)), + }, + { + name: "MaxRandomFactor", + baseReachableTime: defaultBase, + minRandomFactor: defaultMin, + maxRandomFactor: defaultMin, + want: time.Duration(defaultMin * float32(defaultBase)), + }, + { + name: "BothRandomFactor", + baseReachableTime: defaultBase, + minRandomFactor: 2 * defaultMin, + maxRandomFactor: 2 * defaultMax, + want: time.Duration((2*defaultMin + (2*defaultMax-2*defaultMin)*defaultFakeRandomNum) * float32(defaultBase)), + }, + { + name: "BaseReachableTimeAndBothRandomFactors", + baseReachableTime: 2 * defaultBase, + minRandomFactor: 2 * defaultMin, + maxRandomFactor: 2 * defaultMax, + want: time.Duration((2*defaultMin + (2*defaultMax-2*defaultMin)*defaultFakeRandomNum) * float32(2*defaultBase)), + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + c := stack.DefaultNUDConfigurations() + c.BaseReachableTime = defaultBase + c.MinRandomFactor = defaultMin + c.MaxRandomFactor = defaultMax + + // A fake random number generator is used to ensure deterministic + // results. + rng := fakeRand{ + num: defaultFakeRandomNum, + } + s := stack.NewNUDState(c, &rng) + old := s.ReachableTime() + + if got, want := s.ReachableTime(), old; got != want { + t.Errorf("got ReachableTime = %q, want = %q", got, want) + } + + // Check for recomputation when changing the min random factor, the max + // random factor, the base reachability time, or any permutation of those + // three options. + c.BaseReachableTime = test.baseReachableTime + c.MinRandomFactor = test.minRandomFactor + c.MaxRandomFactor = test.maxRandomFactor + s.SetConfig(c) + + if got, want := s.ReachableTime(), test.want; got != want { + t.Errorf("got ReachableTime = %q, want = %q", got, want) + } + + // Verify that ReachableTime isn't recomputed when none of the + // configuration options change. The random factor is changed so that if + // a recompution were to occur, ReachableTime would change. + rng.num = defaultFakeRandomNum / 2.0 + if got, want := s.ReachableTime(), test.want; got != want { + t.Errorf("got ReachableTime = %q, want = %q", got, want) + } + }) + } +} diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go index d65f8049e..91e0110f1 100644 --- a/pkg/tcpip/stack/route.go +++ b/pkg/tcpip/stack/route.go @@ -48,6 +48,10 @@ type Route struct { // Loop controls where WritePacket should send packets. Loop PacketLooping + + // directedBroadcast indicates whether this route is sending a directed + // broadcast packet. + directedBroadcast bool } // makeRoute initializes a new route. It takes ownership of the provided @@ -275,6 +279,12 @@ func (r *Route) Stack() *Stack { return r.ref.stack() } +// IsBroadcast returns true if the route is to send a broadcast packet. +func (r *Route) IsBroadcast() bool { + // Only IPv4 has a notion of broadcast. + return r.directedBroadcast || r.RemoteAddress == header.IPv4Broadcast +} + // ReverseRoute returns new route with given source and destination address. func (r *Route) ReverseRoute(src tcpip.Address, dst tcpip.Address) Route { return Route{ diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index a6faa22c2..5b19c5d59 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -73,6 +73,16 @@ type TCPCubicState struct { WEst float64 } +// TCPRACKState is used to hold a copy of the internal RACK state when the +// TCPProbeFunc is invoked. +type TCPRACKState struct { + XmitTime time.Time + EndSequence seqnum.Value + FACK seqnum.Value + RTT time.Duration + Reord bool +} + // TCPEndpointID is the unique 4 tuple that identifies a given endpoint. type TCPEndpointID struct { // LocalPort is the local port associated with the endpoint. @@ -212,6 +222,9 @@ type TCPSenderState struct { // Cubic holds the state related to CUBIC congestion control. Cubic TCPCubicState + + // RACKState holds the state related to RACK loss detection algorithm. + RACKState TCPRACKState } // TCPSACKInfo holds TCP SACK related information for a given TCP endpoint. @@ -445,6 +458,9 @@ type Stack struct { // ndpConfigs is the default NDP configurations used by interfaces. ndpConfigs NDPConfigurations + // nudConfigs is the default NUD configurations used by interfaces. + nudConfigs NUDConfigurations + // autoGenIPv6LinkLocal determines whether or not the stack will attempt // to auto-generate an IPv6 link-local address for newly enabled non-loopback // NICs. See the AutoGenIPv6LinkLocal field of Options for more details. @@ -454,6 +470,10 @@ type Stack struct { // integrator NDP related events. ndpDisp NDPDispatcher + // nudDisp is the NUD event dispatcher that is used to send the netstack + // integrator NUD related events. + nudDisp NUDDispatcher + // uniqueIDGenerator is a generator of unique identifiers. uniqueIDGenerator UniqueID @@ -518,6 +538,9 @@ type Options struct { // before assigning an address to a NIC. NDPConfigs NDPConfigurations + // NUDConfigs is the default NUD configurations used by interfaces. + NUDConfigs NUDConfigurations + // AutoGenIPv6LinkLocal determines whether or not the stack will attempt to // auto-generate an IPv6 link-local address for newly enabled non-loopback // NICs. @@ -536,6 +559,10 @@ type Options struct { // receive NDP related events. NDPDisp NDPDispatcher + // NUDDisp is the NUD event dispatcher that an integrator can provide to + // receive NUD related events. + NUDDisp NUDDispatcher + // RawFactory produces raw endpoints. Raw endpoints are enabled only if // this is non-nil. RawFactory RawFactory @@ -670,6 +697,8 @@ func New(opts Options) *Stack { // Make sure opts.NDPConfigs contains valid values only. opts.NDPConfigs.validate() + opts.NUDConfigs.resetInvalidFields() + s := &Stack{ transportProtocols: make(map[tcpip.TransportProtocolNumber]*transportProtocolState), networkProtocols: make(map[tcpip.NetworkProtocolNumber]NetworkProtocol), @@ -685,9 +714,11 @@ func New(opts Options) *Stack { icmpRateLimiter: NewICMPRateLimiter(), seed: generateRandUint32(), ndpConfigs: opts.NDPConfigs, + nudConfigs: opts.NUDConfigs, autoGenIPv6LinkLocal: opts.AutoGenIPv6LinkLocal, uniqueIDGenerator: opts.UniqueID, ndpDisp: opts.NDPDisp, + nudDisp: opts.NUDDisp, opaqueIIDOpts: opts.OpaqueIIDOpts, tempIIDSeed: opts.TempIIDSeed, forwarder: newForwardQueue(), @@ -1284,9 +1315,9 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n s.mu.RLock() defer s.mu.RUnlock() - isBroadcast := remoteAddr == header.IPv4Broadcast + isLocalBroadcast := remoteAddr == header.IPv4Broadcast isMulticast := header.IsV4MulticastAddress(remoteAddr) || header.IsV6MulticastAddress(remoteAddr) - needRoute := !(isBroadcast || isMulticast || header.IsV6LinkLocalAddress(remoteAddr)) + needRoute := !(isLocalBroadcast || isMulticast || header.IsV6LinkLocalAddress(remoteAddr)) if id != 0 && !needRoute { if nic, ok := s.nics[id]; ok && nic.enabled() { if ref := s.getRefEP(nic, localAddr, remoteAddr, netProto); ref != nil { @@ -1307,9 +1338,16 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n } r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.isLoopback(), multicastLoop && !nic.isLoopback()) - if needRoute { - r.NextHop = route.Gateway + r.directedBroadcast = route.Destination.IsBroadcast(remoteAddr) + + if len(route.Gateway) > 0 { + if needRoute { + r.NextHop = route.Gateway + } + } else if r.directedBroadcast { + r.RemoteLinkAddress = header.EthernetBroadcastAddress } + return r, nil } } @@ -1862,10 +1900,38 @@ func (s *Stack) SetNDPConfigurations(id tcpip.NICID, c NDPConfigurations) *tcpip } nic.setNDPConfigs(c) - return nil } +// NUDConfigurations gets the per-interface NUD configurations. +func (s *Stack) NUDConfigurations(id tcpip.NICID) (NUDConfigurations, *tcpip.Error) { + s.mu.RLock() + nic, ok := s.nics[id] + s.mu.RUnlock() + + if !ok { + return NUDConfigurations{}, tcpip.ErrUnknownNICID + } + + return nic.NUDConfigs() +} + +// SetNUDConfigurations sets the per-interface NUD configurations. +// +// Note, if c contains invalid NUD configuration values, it will be fixed to +// use default values for the erroneous values. +func (s *Stack) SetNUDConfigurations(id tcpip.NICID, c NUDConfigurations) *tcpip.Error { + s.mu.RLock() + nic, ok := s.nics[id] + s.mu.RUnlock() + + if !ok { + return tcpip.ErrUnknownNICID + } + + return nic.setNUDConfigs(c) +} + // HandleNDPRA provides a NIC with ID id a validated NDP Router Advertisement // message that it needs to handle. func (s *Stack) HandleNDPRA(id tcpip.NICID, ip tcpip.Address, ra header.NDPRouterAdvert) *tcpip.Error { @@ -1919,8 +1985,8 @@ func generateRandInt64() int64 { // FindNetworkEndpoint returns the network endpoint for the given address. func (s *Stack) FindNetworkEndpoint(netProto tcpip.NetworkProtocolNumber, address tcpip.Address) (NetworkEndpoint, *tcpip.Error) { - s.mu.Lock() - defer s.mu.Unlock() + s.mu.RLock() + defer s.mu.RUnlock() for _, nic := range s.nics { id := NetworkEndpointID{address} @@ -1939,8 +2005,8 @@ func (s *Stack) FindNetworkEndpoint(netProto tcpip.NetworkProtocolNumber, addres // FindNICNameFromID returns the name of the nic for the given NICID. func (s *Stack) FindNICNameFromID(id tcpip.NICID) string { - s.mu.Lock() - defer s.mu.Unlock() + s.mu.RLock() + defer s.mu.RUnlock() nic, ok := s.nics[id] if !ok { diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index 101ca2206..0b6deda02 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -27,6 +27,7 @@ import ( "time" "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" @@ -276,6 +277,17 @@ func (l *linkEPWithMockedAttach) isAttached() bool { return l.attached } +// Checks to see if list contains an address. +func containsAddr(list []tcpip.ProtocolAddress, item tcpip.ProtocolAddress) bool { + for _, i := range list { + if i == item { + return true + } + } + + return false +} + func TestNetworkReceive(t *testing.T) { // Create a stack with the fake network protocol, one nic, and two // addresses attached to it: 1 & 2. @@ -1703,7 +1715,7 @@ func testNicForAddressRange(t *testing.T, nicID tcpip.NICID, s *stack.Stack, sub // Trying the next address should always fail since it is outside the range. if gotNicID := s.CheckLocalAddress(0, fakeNetNumber, tcpip.Address(addrBytes)); gotNicID != 0 { - t.Errorf("got CheckLocalAddress(0, %d, %s) = %d, want = %d", fakeNetNumber, tcpip.Address(addrBytes), gotNicID, 0) + t.Errorf("got CheckLocalAddress(0, %d, %s) = %d, want = 0", fakeNetNumber, tcpip.Address(addrBytes), gotNicID) } } @@ -3088,6 +3100,13 @@ func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) { func TestAddRemoveIPv4BroadcastAddressOnNICEnableDisable(t *testing.T) { const nicID = 1 + broadcastAddr := tcpip.ProtocolAddress{ + Protocol: header.IPv4ProtocolNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: header.IPv4Broadcast, + PrefixLen: 32, + }, + } e := loopback.New() s := stack.New(stack.Options{ @@ -3098,49 +3117,41 @@ func TestAddRemoveIPv4BroadcastAddressOnNICEnableDisable(t *testing.T) { t.Fatalf("CreateNIC(%d, _, %+v) = %s", nicID, nicOpts, err) } - allStackAddrs := s.AllAddresses() - allNICAddrs, ok := allStackAddrs[nicID] - if !ok { - t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs) - } - if l := len(allNICAddrs); l != 0 { - t.Fatalf("got len(allNICAddrs) = %d, want = 0", l) + { + allStackAddrs := s.AllAddresses() + if allNICAddrs, ok := allStackAddrs[nicID]; !ok { + t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs) + } else if containsAddr(allNICAddrs, broadcastAddr) { + t.Fatalf("got allNICAddrs = %+v, don't want = %+v", allNICAddrs, broadcastAddr) + } } // Enabling the NIC should add the IPv4 broadcast address. if err := s.EnableNIC(nicID); err != nil { t.Fatalf("s.EnableNIC(%d): %s", nicID, err) } - allStackAddrs = s.AllAddresses() - allNICAddrs, ok = allStackAddrs[nicID] - if !ok { - t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs) - } - if l := len(allNICAddrs); l != 1 { - t.Fatalf("got len(allNICAddrs) = %d, want = 1", l) - } - want := tcpip.ProtocolAddress{ - Protocol: header.IPv4ProtocolNumber, - AddressWithPrefix: tcpip.AddressWithPrefix{ - Address: header.IPv4Broadcast, - PrefixLen: 32, - }, - } - if allNICAddrs[0] != want { - t.Fatalf("got allNICAddrs[0] = %+v, want = %+v", allNICAddrs[0], want) + + { + allStackAddrs := s.AllAddresses() + if allNICAddrs, ok := allStackAddrs[nicID]; !ok { + t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs) + } else if !containsAddr(allNICAddrs, broadcastAddr) { + t.Fatalf("got allNICAddrs = %+v, want = %+v", allNICAddrs, broadcastAddr) + } } // Disabling the NIC should remove the IPv4 broadcast address. if err := s.DisableNIC(nicID); err != nil { t.Fatalf("s.DisableNIC(%d): %s", nicID, err) } - allStackAddrs = s.AllAddresses() - allNICAddrs, ok = allStackAddrs[nicID] - if !ok { - t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs) - } - if l := len(allNICAddrs); l != 0 { - t.Fatalf("got len(allNICAddrs) = %d, want = 0", l) + + { + allStackAddrs := s.AllAddresses() + if allNICAddrs, ok := allStackAddrs[nicID]; !ok { + t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs) + } else if containsAddr(allNICAddrs, broadcastAddr) { + t.Fatalf("got allNICAddrs = %+v, don't want = %+v", allNICAddrs, broadcastAddr) + } } } @@ -3188,50 +3199,93 @@ func TestLeaveIPv6SolicitedNodeAddrBeforeAddrRemoval(t *testing.T) { } } -func TestJoinLeaveAllNodesMulticastOnNICEnableDisable(t *testing.T) { +func TestJoinLeaveMulticastOnNICEnableDisable(t *testing.T) { const nicID = 1 - e := loopback.New() - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - }) - nicOpts := stack.NICOptions{Disabled: true} - if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil { - t.Fatalf("CreateNIC(%d, _, %+v) = %s", nicID, nicOpts, err) + tests := []struct { + name string + proto tcpip.NetworkProtocolNumber + addr tcpip.Address + }{ + { + name: "IPv6 All-Nodes", + proto: header.IPv6ProtocolNumber, + addr: header.IPv6AllNodesMulticastAddress, + }, + { + name: "IPv4 All-Systems", + proto: header.IPv4ProtocolNumber, + addr: header.IPv4AllSystems, + }, } - // Should not be in the IPv6 all-nodes multicast group yet because the NIC has - // not been enabled yet. - isInGroup, err := s.IsInGroup(nicID, header.IPv6AllNodesMulticastAddress) - if err != nil { - t.Fatalf("IsInGroup(%d, %s): %s", nicID, header.IPv6AllNodesMulticastAddress, err) - } - if isInGroup { - t.Fatalf("got IsInGroup(%d, %s) = true, want = false", nicID, header.IPv6AllNodesMulticastAddress) - } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + e := loopback.New() + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()}, + }) + nicOpts := stack.NICOptions{Disabled: true} + if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil { + t.Fatalf("CreateNIC(%d, _, %+v) = %s", nicID, nicOpts, err) + } - // The all-nodes multicast group should be joined when the NIC is enabled. - if err := s.EnableNIC(nicID); err != nil { - t.Fatalf("s.EnableNIC(%d): %s", nicID, err) - } - isInGroup, err = s.IsInGroup(nicID, header.IPv6AllNodesMulticastAddress) - if err != nil { - t.Fatalf("IsInGroup(%d, %s): %s", nicID, header.IPv6AllNodesMulticastAddress, err) - } - if !isInGroup { - t.Fatalf("got IsInGroup(%d, %s) = false, want = true", nicID, header.IPv6AllNodesMulticastAddress) - } + // Should not be in the multicast group yet because the NIC has not been + // enabled yet. + if isInGroup, err := s.IsInGroup(nicID, test.addr); err != nil { + t.Fatalf("IsInGroup(%d, %s): %s", nicID, test.addr, err) + } else if isInGroup { + t.Fatalf("got IsInGroup(%d, %s) = true, want = false", nicID, test.addr) + } - // The all-nodes multicast group should be left when the NIC is disabled. - if err := s.DisableNIC(nicID); err != nil { - t.Fatalf("s.DisableNIC(%d): %s", nicID, err) - } - isInGroup, err = s.IsInGroup(nicID, header.IPv6AllNodesMulticastAddress) - if err != nil { - t.Fatalf("IsInGroup(%d, %s): %s", nicID, header.IPv6AllNodesMulticastAddress, err) - } - if isInGroup { - t.Fatalf("got IsInGroup(%d, %s) = true, want = false", nicID, header.IPv6AllNodesMulticastAddress) + // The all-nodes multicast group should be joined when the NIC is enabled. + if err := s.EnableNIC(nicID); err != nil { + t.Fatalf("s.EnableNIC(%d): %s", nicID, err) + } + + if isInGroup, err := s.IsInGroup(nicID, test.addr); err != nil { + t.Fatalf("IsInGroup(%d, %s): %s", nicID, test.addr, err) + } else if !isInGroup { + t.Fatalf("got IsInGroup(%d, %s) = false, want = true", nicID, test.addr) + } + + // The multicast group should be left when the NIC is disabled. + if err := s.DisableNIC(nicID); err != nil { + t.Fatalf("s.DisableNIC(%d): %s", nicID, err) + } + + if isInGroup, err := s.IsInGroup(nicID, test.addr); err != nil { + t.Fatalf("IsInGroup(%d, %s): %s", nicID, test.addr, err) + } else if isInGroup { + t.Fatalf("got IsInGroup(%d, %s) = true, want = false", nicID, test.addr) + } + + // The all-nodes multicast group should be joined when the NIC is enabled. + if err := s.EnableNIC(nicID); err != nil { + t.Fatalf("s.EnableNIC(%d): %s", nicID, err) + } + + if isInGroup, err := s.IsInGroup(nicID, test.addr); err != nil { + t.Fatalf("IsInGroup(%d, %s): %s", nicID, test.addr, err) + } else if !isInGroup { + t.Fatalf("got IsInGroup(%d, %s) = false, want = true", nicID, test.addr) + } + + // Leaving the group before disabling the NIC should not cause an error. + if err := s.LeaveGroup(test.proto, nicID, test.addr); err != nil { + t.Fatalf("s.LeaveGroup(%d, %d, %s): %s", test.proto, nicID, test.addr, err) + } + + if err := s.DisableNIC(nicID); err != nil { + t.Fatalf("s.DisableNIC(%d): %s", nicID, err) + } + + if isInGroup, err := s.IsInGroup(nicID, test.addr); err != nil { + t.Fatalf("IsInGroup(%d, %s): %s", nicID, test.addr, err) + } else if isInGroup { + t.Fatalf("got IsInGroup(%d, %s) = true, want = false", nicID, test.addr) + } + }) } } @@ -3418,3 +3472,225 @@ func TestStackSendBufferSizeOption(t *testing.T) { }) } } + +func TestOutgoingSubnetBroadcast(t *testing.T) { + const ( + unspecifiedNICID = 0 + nicID1 = 1 + ) + + defaultAddr := tcpip.AddressWithPrefix{ + Address: header.IPv4Any, + PrefixLen: 0, + } + defaultSubnet := defaultAddr.Subnet() + ipv4Addr := tcpip.AddressWithPrefix{ + Address: "\xc0\xa8\x01\x3a", + PrefixLen: 24, + } + ipv4Subnet := ipv4Addr.Subnet() + ipv4SubnetBcast := ipv4Subnet.Broadcast() + ipv4Gateway := tcpip.Address("\xc0\xa8\x01\x01") + ipv4AddrPrefix31 := tcpip.AddressWithPrefix{ + Address: "\xc0\xa8\x01\x3a", + PrefixLen: 31, + } + ipv4Subnet31 := ipv4AddrPrefix31.Subnet() + ipv4Subnet31Bcast := ipv4Subnet31.Broadcast() + ipv4AddrPrefix32 := tcpip.AddressWithPrefix{ + Address: "\xc0\xa8\x01\x3a", + PrefixLen: 32, + } + ipv4Subnet32 := ipv4AddrPrefix32.Subnet() + ipv4Subnet32Bcast := ipv4Subnet32.Broadcast() + ipv6Addr := tcpip.AddressWithPrefix{ + Address: "\x20\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01", + PrefixLen: 64, + } + ipv6Subnet := ipv6Addr.Subnet() + ipv6SubnetBcast := ipv6Subnet.Broadcast() + remNetAddr := tcpip.AddressWithPrefix{ + Address: "\x64\x0a\x7b\x18", + PrefixLen: 24, + } + remNetSubnet := remNetAddr.Subnet() + remNetSubnetBcast := remNetSubnet.Broadcast() + + tests := []struct { + name string + nicAddr tcpip.ProtocolAddress + routes []tcpip.Route + remoteAddr tcpip.Address + expectedRoute stack.Route + }{ + // Broadcast to a locally attached subnet populates the broadcast MAC. + { + name: "IPv4 Broadcast to local subnet", + nicAddr: tcpip.ProtocolAddress{ + Protocol: header.IPv4ProtocolNumber, + AddressWithPrefix: ipv4Addr, + }, + routes: []tcpip.Route{ + { + Destination: ipv4Subnet, + NIC: nicID1, + }, + }, + remoteAddr: ipv4SubnetBcast, + expectedRoute: stack.Route{ + LocalAddress: ipv4Addr.Address, + RemoteAddress: ipv4SubnetBcast, + RemoteLinkAddress: header.EthernetBroadcastAddress, + NetProto: header.IPv4ProtocolNumber, + Loop: stack.PacketOut, + }, + }, + // Broadcast to a locally attached /31 subnet does not populate the + // broadcast MAC. + { + name: "IPv4 Broadcast to local /31 subnet", + nicAddr: tcpip.ProtocolAddress{ + Protocol: header.IPv4ProtocolNumber, + AddressWithPrefix: ipv4AddrPrefix31, + }, + routes: []tcpip.Route{ + { + Destination: ipv4Subnet31, + NIC: nicID1, + }, + }, + remoteAddr: ipv4Subnet31Bcast, + expectedRoute: stack.Route{ + LocalAddress: ipv4AddrPrefix31.Address, + RemoteAddress: ipv4Subnet31Bcast, + NetProto: header.IPv4ProtocolNumber, + Loop: stack.PacketOut, + }, + }, + // Broadcast to a locally attached /32 subnet does not populate the + // broadcast MAC. + { + name: "IPv4 Broadcast to local /32 subnet", + nicAddr: tcpip.ProtocolAddress{ + Protocol: header.IPv4ProtocolNumber, + AddressWithPrefix: ipv4AddrPrefix32, + }, + routes: []tcpip.Route{ + { + Destination: ipv4Subnet32, + NIC: nicID1, + }, + }, + remoteAddr: ipv4Subnet32Bcast, + expectedRoute: stack.Route{ + LocalAddress: ipv4AddrPrefix32.Address, + RemoteAddress: ipv4Subnet32Bcast, + NetProto: header.IPv4ProtocolNumber, + Loop: stack.PacketOut, + }, + }, + // IPv6 has no notion of a broadcast. + { + name: "IPv6 'Broadcast' to local subnet", + nicAddr: tcpip.ProtocolAddress{ + Protocol: header.IPv6ProtocolNumber, + AddressWithPrefix: ipv6Addr, + }, + routes: []tcpip.Route{ + { + Destination: ipv6Subnet, + NIC: nicID1, + }, + }, + remoteAddr: ipv6SubnetBcast, + expectedRoute: stack.Route{ + LocalAddress: ipv6Addr.Address, + RemoteAddress: ipv6SubnetBcast, + NetProto: header.IPv6ProtocolNumber, + Loop: stack.PacketOut, + }, + }, + // Broadcast to a remote subnet in the route table is send to the next-hop + // gateway. + { + name: "IPv4 Broadcast to remote subnet", + nicAddr: tcpip.ProtocolAddress{ + Protocol: header.IPv4ProtocolNumber, + AddressWithPrefix: ipv4Addr, + }, + routes: []tcpip.Route{ + { + Destination: remNetSubnet, + Gateway: ipv4Gateway, + NIC: nicID1, + }, + }, + remoteAddr: remNetSubnetBcast, + expectedRoute: stack.Route{ + LocalAddress: ipv4Addr.Address, + RemoteAddress: remNetSubnetBcast, + NextHop: ipv4Gateway, + NetProto: header.IPv4ProtocolNumber, + Loop: stack.PacketOut, + }, + }, + // Broadcast to an unknown subnet follows the default route. Note that this + // is essentially just routing an unknown destination IP, because w/o any + // subnet prefix information a subnet broadcast address is just a normal IP. + { + name: "IPv4 Broadcast to unknown subnet", + nicAddr: tcpip.ProtocolAddress{ + Protocol: header.IPv4ProtocolNumber, + AddressWithPrefix: ipv4Addr, + }, + routes: []tcpip.Route{ + { + Destination: defaultSubnet, + Gateway: ipv4Gateway, + NIC: nicID1, + }, + }, + remoteAddr: remNetSubnetBcast, + expectedRoute: stack.Route{ + LocalAddress: ipv4Addr.Address, + RemoteAddress: remNetSubnetBcast, + NextHop: ipv4Gateway, + NetProto: header.IPv4ProtocolNumber, + Loop: stack.PacketOut, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()}, + }) + ep := channel.New(0, defaultMTU, "") + if err := s.CreateNIC(nicID1, ep); err != nil { + t.Fatalf("CreateNIC(%d, _): %s", nicID1, err) + } + if err := s.AddProtocolAddress(nicID1, test.nicAddr); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID1, test.nicAddr, err) + } + + s.SetRouteTable(test.routes) + + var netProto tcpip.NetworkProtocolNumber + switch l := len(test.remoteAddr); l { + case header.IPv4AddressSize: + netProto = header.IPv4ProtocolNumber + case header.IPv6AddressSize: + netProto = header.IPv6ProtocolNumber + default: + t.Fatalf("got unexpected address length = %d bytes", l) + } + + if r, err := s.FindRoute(unspecifiedNICID, "" /* localAddr */, test.remoteAddr, netProto, false /* multicastLoop */); err != nil { + t.Fatalf("FindRoute(%d, '', %s, %d): %s", unspecifiedNICID, test.remoteAddr, netProto, err) + } else if diff := cmp.Diff(r, test.expectedRoute, cmpopts.IgnoreUnexported(r)); diff != "" { + t.Errorf("route mismatch (-want +got):\n%s", diff) + } + }) + } +} diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 21aafb0a2..45f59b60f 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -43,6 +43,9 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// Using header.IPv4AddressSize would cause an import cycle. +const ipv4AddressSize = 4 + // Error represents an error in the netstack error space. Using a special type // ensures that errors outside of this space are not accidentally introduced. // @@ -320,6 +323,29 @@ func (s *Subnet) Broadcast() Address { return Address(addr) } +// IsBroadcast returns true if the address is considered a broadcast address. +func (s *Subnet) IsBroadcast(address Address) bool { + // Only IPv4 supports the notion of a broadcast address. + if len(address) != ipv4AddressSize { + return false + } + + // Normally, we would just compare address with the subnet's broadcast + // address but there is an exception where a simple comparison is not + // correct. This exception is for /31 and /32 IPv4 subnets where all + // addresses are considered valid host addresses. + // + // For /31 subnets, the case is easy. RFC 3021 Section 2.1 states that + // both addresses in a /31 subnet "MUST be interpreted as host addresses." + // + // For /32, the case is a bit more vague. RFC 3021 makes no mention of /32 + // subnets. However, the same reasoning applies - if an exception is not + // made, then there do not exist any host addresses in a /32 subnet. RFC + // 4632 Section 3.1 also vaguely implies this interpretation by referring + // to addresses in /32 subnets as "host routes." + return s.Prefix() <= 30 && s.Broadcast() == address +} + // Equal returns true if s equals o. // // Needed to use cmp.Equal on Subnet as its fields are unexported. @@ -928,6 +954,10 @@ type DefaultTTLOption uint8 // classic BPF filter on a given endpoint. type SocketDetachFilterOption int +// OriginalDestinationOption is used to get the original destination address +// and port of a redirected packet. +type OriginalDestinationOption FullAddress + // IPPacketInfo is the message structure for IP_PKTINFO. // // +stateify savable diff --git a/pkg/tcpip/tests/integration/BUILD b/pkg/tcpip/tests/integration/BUILD new file mode 100644 index 000000000..7fff30462 --- /dev/null +++ b/pkg/tcpip/tests/integration/BUILD @@ -0,0 +1,21 @@ +load("//tools:defs.bzl", "go_test") + +package(licenses = ["notice"]) + +go_test( + name = "integration_test", + size = "small", + srcs = ["multicast_broadcast_test.go"], + deps = [ + "//pkg/tcpip", + "//pkg/tcpip/buffer", + "//pkg/tcpip/header", + "//pkg/tcpip/link/channel", + "//pkg/tcpip/network/ipv4", + "//pkg/tcpip/network/ipv6", + "//pkg/tcpip/stack", + "//pkg/tcpip/transport/udp", + "//pkg/waiter", + "@com_github_google_go_cmp//cmp:go_default_library", + ], +) diff --git a/pkg/tcpip/tests/integration/multicast_broadcast_test.go b/pkg/tcpip/tests/integration/multicast_broadcast_test.go new file mode 100644 index 000000000..d9b2d147a --- /dev/null +++ b/pkg/tcpip/tests/integration/multicast_broadcast_test.go @@ -0,0 +1,274 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package integration_test + +import ( + "testing" + + "github.com/google/go-cmp/cmp" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/link/channel" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport/udp" + "gvisor.dev/gvisor/pkg/waiter" +) + +const defaultMTU = 1280 + +// TestIncomingMulticastAndBroadcast tests receiving a packet destined to some +// multicast or broadcast address. +func TestIncomingMulticastAndBroadcast(t *testing.T) { + const ( + nicID = 1 + remotePort = 5555 + localPort = 80 + ttl = 255 + ) + + data := []byte{1, 2, 3, 4} + + // Local IPv4 subnet: 192.168.1.58/24 + ipv4Addr := tcpip.AddressWithPrefix{ + Address: "\xc0\xa8\x01\x3a", + PrefixLen: 24, + } + ipv4Subnet := ipv4Addr.Subnet() + ipv4SubnetBcast := ipv4Subnet.Broadcast() + + // Local IPv6 subnet: 200a::1/64 + ipv6Addr := tcpip.AddressWithPrefix{ + Address: "\x20\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01", + PrefixLen: 64, + } + ipv6Subnet := ipv6Addr.Subnet() + ipv6SubnetBcast := ipv6Subnet.Broadcast() + + // Remote addrs. + remoteIPv4Addr := tcpip.Address("\x64\x0a\x7b\x18") + remoteIPv6Addr := tcpip.Address("\x20\x0b\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02") + + rxIPv4UDP := func(e *channel.Endpoint, dst tcpip.Address) { + payloadLen := header.UDPMinimumSize + len(data) + totalLen := header.IPv4MinimumSize + payloadLen + hdr := buffer.NewPrependable(totalLen) + u := header.UDP(hdr.Prepend(payloadLen)) + u.Encode(&header.UDPFields{ + SrcPort: remotePort, + DstPort: localPort, + Length: uint16(payloadLen), + }) + copy(u.Payload(), data) + sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, remoteIPv4Addr, dst, uint16(payloadLen)) + sum = header.Checksum(data, sum) + u.SetChecksum(^u.CalculateChecksum(sum)) + + ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize)) + ip.Encode(&header.IPv4Fields{ + IHL: header.IPv4MinimumSize, + TotalLength: uint16(totalLen), + Protocol: uint8(udp.ProtocolNumber), + TTL: ttl, + SrcAddr: remoteIPv4Addr, + DstAddr: dst, + }) + + e.InjectInbound(header.IPv4ProtocolNumber, &stack.PacketBuffer{ + Data: hdr.View().ToVectorisedView(), + }) + } + + rxIPv6UDP := func(e *channel.Endpoint, dst tcpip.Address) { + payloadLen := header.UDPMinimumSize + len(data) + hdr := buffer.NewPrependable(header.IPv6MinimumSize + payloadLen) + u := header.UDP(hdr.Prepend(payloadLen)) + u.Encode(&header.UDPFields{ + SrcPort: remotePort, + DstPort: localPort, + Length: uint16(payloadLen), + }) + copy(u.Payload(), data) + sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, remoteIPv6Addr, dst, uint16(payloadLen)) + sum = header.Checksum(data, sum) + u.SetChecksum(^u.CalculateChecksum(sum)) + + ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + PayloadLength: uint16(payloadLen), + NextHeader: uint8(udp.ProtocolNumber), + HopLimit: ttl, + SrcAddr: remoteIPv6Addr, + DstAddr: dst, + }) + + e.InjectInbound(header.IPv6ProtocolNumber, &stack.PacketBuffer{ + Data: hdr.View().ToVectorisedView(), + }) + } + + tests := []struct { + name string + bindAddr tcpip.Address + dstAddr tcpip.Address + expectRx bool + }{ + { + name: "IPv4 unicast binding to unicast", + bindAddr: ipv4Addr.Address, + dstAddr: ipv4Addr.Address, + expectRx: true, + }, + { + name: "IPv4 unicast binding to broadcast", + bindAddr: header.IPv4Broadcast, + dstAddr: ipv4Addr.Address, + expectRx: false, + }, + { + name: "IPv4 unicast binding to wildcard", + dstAddr: ipv4Addr.Address, + expectRx: true, + }, + + { + name: "IPv4 directed broadcast binding to subnet broadcast", + bindAddr: ipv4SubnetBcast, + dstAddr: ipv4SubnetBcast, + expectRx: true, + }, + { + name: "IPv4 directed broadcast binding to broadcast", + bindAddr: header.IPv4Broadcast, + dstAddr: ipv4SubnetBcast, + expectRx: false, + }, + { + name: "IPv4 directed broadcast binding to wildcard", + dstAddr: ipv4SubnetBcast, + expectRx: true, + }, + + { + name: "IPv4 broadcast binding to broadcast", + bindAddr: header.IPv4Broadcast, + dstAddr: header.IPv4Broadcast, + expectRx: true, + }, + { + name: "IPv4 broadcast binding to subnet broadcast", + bindAddr: ipv4SubnetBcast, + dstAddr: header.IPv4Broadcast, + expectRx: false, + }, + { + name: "IPv4 broadcast binding to wildcard", + dstAddr: ipv4SubnetBcast, + expectRx: true, + }, + + { + name: "IPv4 all-systems multicast binding to all-systems multicast", + bindAddr: header.IPv4AllSystems, + dstAddr: header.IPv4AllSystems, + expectRx: true, + }, + { + name: "IPv4 all-systems multicast binding to wildcard", + dstAddr: header.IPv4AllSystems, + expectRx: true, + }, + { + name: "IPv4 all-systems multicast binding to unicast", + bindAddr: ipv4Addr.Address, + dstAddr: header.IPv4AllSystems, + expectRx: false, + }, + + // IPv6 has no notion of a broadcast. + { + name: "IPv6 unicast binding to wildcard", + dstAddr: ipv6Addr.Address, + expectRx: true, + }, + { + name: "IPv6 broadcast-like address binding to wildcard", + dstAddr: ipv6SubnetBcast, + expectRx: false, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()}, + TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}, + }) + e := channel.New(0, defaultMTU, "") + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _): %s", nicID, err) + } + ipv4ProtoAddr := tcpip.ProtocolAddress{Protocol: header.IPv4ProtocolNumber, AddressWithPrefix: ipv4Addr} + if err := s.AddProtocolAddress(nicID, ipv4ProtoAddr); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID, ipv4ProtoAddr, err) + } + ipv6ProtoAddr := tcpip.ProtocolAddress{Protocol: header.IPv6ProtocolNumber, AddressWithPrefix: ipv6Addr} + if err := s.AddProtocolAddress(nicID, ipv6ProtoAddr); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID, ipv6ProtoAddr, err) + } + + var netproto tcpip.NetworkProtocolNumber + var rxUDP func(*channel.Endpoint, tcpip.Address) + switch l := len(test.dstAddr); l { + case header.IPv4AddressSize: + netproto = header.IPv4ProtocolNumber + rxUDP = rxIPv4UDP + case header.IPv6AddressSize: + netproto = header.IPv6ProtocolNumber + rxUDP = rxIPv6UDP + default: + t.Fatalf("got unexpected address length = %d bytes", l) + } + + wq := waiter.Queue{} + ep, err := s.NewEndpoint(udp.ProtocolNumber, netproto, &wq) + if err != nil { + t.Fatalf("NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, netproto, err) + } + defer ep.Close() + + bindAddr := tcpip.FullAddress{Addr: test.bindAddr, Port: localPort} + if err := ep.Bind(bindAddr); err != nil { + t.Fatalf("ep.Bind(%+v): %s", bindAddr, err) + } + + rxUDP(e, test.dstAddr) + if gotPayload, _, err := ep.Read(nil); test.expectRx { + if err != nil { + t.Fatalf("Read(nil): %s", err) + } + if diff := cmp.Diff(buffer.View(data), gotPayload); diff != "" { + t.Errorf("got UDP payload mismatch (-want +got):\n%s", diff) + } + } else { + if err != tcpip.ErrWouldBlock { + t.Fatalf("got Read(nil) = (%x, _, %v), want = (_, _, %s)", gotPayload, err, tcpip.ErrWouldBlock) + } + } + }) + } +} diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index e860ee484..234fb95ce 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -40,6 +40,8 @@ go_library( "endpoint_state.go", "forwarder.go", "protocol.go", + "rack.go", + "rack_state.go", "rcv.go", "rcv_state.go", "reno.go", @@ -83,6 +85,7 @@ go_test( "dual_stack_test.go", "sack_scoreboard_test.go", "tcp_noracedetector_test.go", + "tcp_rack_test.go", "tcp_sack_test.go", "tcp_test.go", "tcp_timestamp_test.go", diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index 6e00e5526..913ea6535 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -521,7 +521,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) { synOpts := header.TCPSynOptions{ WS: -1, TS: opts.TS, - TSVal: tcpTimeStamp(timeStampOffset()), + TSVal: tcpTimeStamp(time.Now(), timeStampOffset()), TSEcr: opts.TSVal, MSS: mssForRoute(&s.route), } diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 1798510bc..8dd759ba2 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -1024,14 +1024,19 @@ func (e *endpoint) transitionToStateEstablishedLocked(h *handshake) { // delivered to this endpoint from the demuxer when the endpoint // is transitioned to StateClose. func (e *endpoint) transitionToStateCloseLocked() { - if e.EndpointState() == StateClose { + s := e.EndpointState() + if s == StateClose { return } + + if s.connected() { + e.stack.Stats().TCP.CurrentConnected.Decrement() + e.stack.Stats().TCP.EstablishedClosed.Increment() + } + // Mark the endpoint as fully closed for reads/writes. e.cleanupLocked() e.setEndpointState(StateClose) - e.stack.Stats().TCP.CurrentConnected.Decrement() - e.stack.Stats().TCP.EstablishedClosed.Increment() } // tryDeliverSegmentFromClosedEndpoint attempts to deliver the parsed @@ -1161,13 +1166,18 @@ func (e *endpoint) handleSegments(fastPath bool) *tcpip.Error { return nil } -// handleSegment handles a given segment and notifies the worker goroutine if -// if the connection should be terminated. -func (e *endpoint) handleSegment(s *segment) (cont bool, err *tcpip.Error) { - // Invoke the tcp probe if installed. +func (e *endpoint) probeSegment() { if e.probe != nil { e.probe(e.completeState()) } +} + +// handleSegment handles a given segment and notifies the worker goroutine if +// if the connection should be terminated. +func (e *endpoint) handleSegment(s *segment) (cont bool, err *tcpip.Error) { + // Invoke the tcp probe if installed. The tcp probe function will update + // the TCPEndpointState after the segment is processed. + defer e.probeSegment() if s.flagIsSet(header.TCPFlagRst) { if ok, err := e.handleReset(s); !ok { diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 0f7487963..b8b52b03d 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1777,15 +1777,8 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { // Same as effectively disabling TCPLinger timeout. v = 0 } - var stkTCPLingerTimeout tcpip.TCPLingerTimeoutOption - if err := e.stack.TransportProtocolOption(header.TCPProtocolNumber, &stkTCPLingerTimeout); err != nil { - // We were unable to retrieve a stack config, just use - // the DefaultTCPLingerTimeout. - if v > tcpip.TCPLingerTimeoutOption(DefaultTCPLingerTimeout) { - stkTCPLingerTimeout = tcpip.TCPLingerTimeoutOption(DefaultTCPLingerTimeout) - } - } - // Cap it to the stack wide TCPLinger timeout. + // Cap it to MaxTCPLingerTimeout. + stkTCPLingerTimeout := tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout) if v > stkTCPLingerTimeout { v = stkTCPLingerTimeout } @@ -2017,6 +2010,17 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { *o = tcpip.TCPDeferAcceptOption(e.deferAccept) e.UnlockUser() + case *tcpip.OriginalDestinationOption: + ipt := e.stack.IPTables() + addr, port, err := ipt.OriginalDst(e.ID) + if err != nil { + return err + } + *o = tcpip.OriginalDestinationOption{ + Addr: addr, + Port: port, + } + default: return tcpip.ErrUnknownProtocolOption } @@ -2681,15 +2685,14 @@ func (e *endpoint) maybeEnableTimestamp(synOpts *header.TCPSynOptions) { // timestamp returns the timestamp value to be used in the TSVal field of the // timestamp option for outgoing TCP segments for a given endpoint. func (e *endpoint) timestamp() uint32 { - return tcpTimeStamp(e.tsOffset) + return tcpTimeStamp(time.Now(), e.tsOffset) } // tcpTimeStamp returns a timestamp offset by the provided offset. This is // not inlined above as it's used when SYN cookies are in use and endpoint // is not created at the time when the SYN cookie is sent. -func tcpTimeStamp(offset uint32) uint32 { - now := time.Now() - return uint32(now.Unix()*1000+int64(now.Nanosecond()/1e6)) + offset +func tcpTimeStamp(curTime time.Time, offset uint32) uint32 { + return uint32(curTime.Unix()*1000+int64(curTime.Nanosecond()/1e6)) + offset } // timeStampOffset returns a randomized timestamp offset to be used when sending @@ -2832,6 +2835,14 @@ func (e *endpoint) completeState() stack.TCPEndpointState { WEst: cubic.wEst, } } + + rc := e.snd.rc + s.Sender.RACKState = stack.TCPRACKState{ + XmitTime: rc.xmitTime, + EndSequence: rc.endSequence, + FACK: rc.fack, + RTT: rc.rtt, + } return s } diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index 5d6174a59..2e5093b36 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -49,7 +49,7 @@ const ( // DefaultReceiveBufferSize is the default size of the receive buffer // for an endpoint. - DefaultReceiveBufferSize = 32 << 10 // 32KB + DefaultReceiveBufferSize = 1 << 20 // 1MB // MaxBufferSize is the largest size a receive/send buffer can grow to. MaxBufferSize = 4 << 20 // 4MB @@ -62,6 +62,10 @@ const ( // FIN_WAIT_2 state before being marked closed. DefaultTCPLingerTimeout = 60 * time.Second + // MaxTCPLingerTimeout is the maximum amount of time that sockets + // linger in FIN_WAIT_2 state before being marked closed. + MaxTCPLingerTimeout = 120 * time.Second + // DefaultTCPTimeWaitTimeout is the amount of time that sockets linger // in TIME_WAIT state before being marked closed. DefaultTCPTimeWaitTimeout = 60 * time.Second @@ -80,6 +84,25 @@ const ( // enable/disable SACK support in TCP. See: https://tools.ietf.org/html/rfc2018. type SACKEnabled bool +// Recovery is used by stack.(*Stack).TransportProtocolOption to +// set loss detection algorithm in TCP. +type Recovery int32 + +const ( + // RACKLossDetection indicates RACK is used for loss detection and + // recovery. + RACKLossDetection Recovery = 1 << iota + + // RACKStaticReoWnd indicates the reordering window should not be + // adjusted when DSACK is received. + RACKStaticReoWnd + + // RACKNoDupTh indicates RACK should not consider the classic three + // duplicate acknowledgements rule to mark the segments as lost. This + // is used when reordering is not detected. + RACKNoDupTh +) + // DelayEnabled is used by stack.(Stack*).TransportProtocolOption to // enable/disable Nagle's algorithm in TCP. type DelayEnabled bool @@ -161,6 +184,7 @@ func (s *synRcvdCounter) Threshold() uint64 { type protocol struct { mu sync.RWMutex sackEnabled bool + recovery Recovery delayEnabled bool sendBufferSize SendBufferSizeOption recvBufferSize ReceiveBufferSizeOption @@ -280,6 +304,12 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error { p.mu.Unlock() return nil + case Recovery: + p.mu.Lock() + p.recovery = Recovery(v) + p.mu.Unlock() + return nil + case DelayEnabled: p.mu.Lock() p.delayEnabled = bool(v) @@ -394,6 +424,12 @@ func (p *protocol) Option(option interface{}) *tcpip.Error { p.mu.RUnlock() return nil + case *Recovery: + p.mu.RLock() + *v = Recovery(p.recovery) + p.mu.RUnlock() + return nil + case *DelayEnabled: p.mu.RLock() *v = DelayEnabled(p.delayEnabled) @@ -535,6 +571,7 @@ func NewProtocol() stack.TransportProtocol { minRTO: MinRTO, maxRTO: MaxRTO, maxRetries: MaxRetries, + recovery: RACKLossDetection, } p.dispatcher.init(runtime.GOMAXPROCS(0)) return &p diff --git a/pkg/tcpip/transport/tcp/rack.go b/pkg/tcpip/transport/tcp/rack.go new file mode 100644 index 000000000..d969ca23a --- /dev/null +++ b/pkg/tcpip/transport/tcp/rack.go @@ -0,0 +1,82 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp + +import ( + "time" + + "gvisor.dev/gvisor/pkg/tcpip/seqnum" +) + +// RACK is a loss detection algorithm used in TCP to detect packet loss and +// reordering using transmission timestamp of the packets instead of packet or +// sequence counts. To use RACK, SACK should be enabled on the connection. + +// rackControl stores the rack related fields. +// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-6.1 +// +// +stateify savable +type rackControl struct { + // xmitTime is the latest transmission timestamp of rackControl.seg. + xmitTime time.Time `state:".(unixTime)"` + + // endSequence is the ending TCP sequence number of rackControl.seg. + endSequence seqnum.Value + + // fack is the highest selectively or cumulatively acknowledged + // sequence. + fack seqnum.Value + + // rtt is the RTT of the most recently delivered packet on the + // connection (either cumulatively acknowledged or selectively + // acknowledged) that was not marked invalid as a possible spurious + // retransmission. + rtt time.Duration +} + +// Update will update the RACK related fields when an ACK has been received. +// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 +func (rc *rackControl) Update(seg *segment, ackSeg *segment, srtt time.Duration, offset uint32) { + rtt := time.Now().Sub(seg.xmitTime) + + // If the ACK is for a retransmitted packet, do not update if it is a + // spurious inference which is determined by below checks: + // 1. When Timestamping option is available, if the TSVal is less than the + // transmit time of the most recent retransmitted packet. + // 2. When RTT calculated for the packet is less than the smoothed RTT + // for the connection. + // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 + // step 2 + if seg.xmitCount > 1 { + if ackSeg.parsedOptions.TS && ackSeg.parsedOptions.TSEcr != 0 { + if ackSeg.parsedOptions.TSEcr < tcpTimeStamp(seg.xmitTime, offset) { + return + } + } + if rtt < srtt { + return + } + } + + rc.rtt = rtt + // Update rc.xmitTime and rc.endSequence to the transmit time and + // ending sequence number of the packet which has been acknowledged + // most recently. + endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) + if rc.xmitTime.Before(seg.xmitTime) || (seg.xmitTime.Equal(rc.xmitTime) && rc.endSequence.LessThan(endSeq)) { + rc.xmitTime = seg.xmitTime + rc.endSequence = endSeq + } +} diff --git a/pkg/tcpip/transport/tcp/rack_state.go b/pkg/tcpip/transport/tcp/rack_state.go new file mode 100644 index 000000000..c9dc7e773 --- /dev/null +++ b/pkg/tcpip/transport/tcp/rack_state.go @@ -0,0 +1,29 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp + +import ( + "time" +) + +// saveXmitTime is invoked by stateify. +func (rc *rackControl) saveXmitTime() unixTime { + return unixTime{rc.xmitTime.Unix(), rc.xmitTime.UnixNano()} +} + +// loadXmitTime is invoked by stateify. +func (rc *rackControl) loadXmitTime(unix unixTime) { + rc.xmitTime = time.Unix(unix.second, unix.nano) +} diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 5862c32f2..c55589c45 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -191,6 +191,10 @@ type sender struct { // cc is the congestion control algorithm in use for this sender. cc congestionControl + + // rc has the fields needed for implementing RACK loss detection + // algorithm. + rc rackControl } // rtt is a synchronization wrapper used to appease stateify. See the comment @@ -1272,21 +1276,21 @@ func (s *sender) checkDuplicateAck(seg *segment) (rtx bool) { // handleRcvdSegment is called when a segment is received; it is responsible for // updating the send-related state. -func (s *sender) handleRcvdSegment(seg *segment) { +func (s *sender) handleRcvdSegment(rcvdSeg *segment) { // Check if we can extract an RTT measurement from this ack. - if !seg.parsedOptions.TS && s.rttMeasureSeqNum.LessThan(seg.ackNumber) { + if !rcvdSeg.parsedOptions.TS && s.rttMeasureSeqNum.LessThan(rcvdSeg.ackNumber) { s.updateRTO(time.Now().Sub(s.rttMeasureTime)) s.rttMeasureSeqNum = s.sndNxt } // Update Timestamp if required. See RFC7323, section-4.3. - if s.ep.sendTSOk && seg.parsedOptions.TS { - s.ep.updateRecentTimestamp(seg.parsedOptions.TSVal, s.maxSentAck, seg.sequenceNumber) + if s.ep.sendTSOk && rcvdSeg.parsedOptions.TS { + s.ep.updateRecentTimestamp(rcvdSeg.parsedOptions.TSVal, s.maxSentAck, rcvdSeg.sequenceNumber) } // Insert SACKBlock information into our scoreboard. if s.ep.sackPermitted { - for _, sb := range seg.parsedOptions.SACKBlocks { + for _, sb := range rcvdSeg.parsedOptions.SACKBlocks { // Only insert the SACK block if the following holds // true: // * SACK block acks data after the ack number in the @@ -1299,27 +1303,27 @@ func (s *sender) handleRcvdSegment(seg *segment) { // NOTE: This check specifically excludes DSACK blocks // which have start/end before sndUna and are used to // indicate spurious retransmissions. - if seg.ackNumber.LessThan(sb.Start) && s.sndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.sndNxt) && !s.ep.scoreboard.IsSACKED(sb) { + if rcvdSeg.ackNumber.LessThan(sb.Start) && s.sndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.sndNxt) && !s.ep.scoreboard.IsSACKED(sb) { s.ep.scoreboard.Insert(sb) - seg.hasNewSACKInfo = true + rcvdSeg.hasNewSACKInfo = true } } s.SetPipe() } // Count the duplicates and do the fast retransmit if needed. - rtx := s.checkDuplicateAck(seg) + rtx := s.checkDuplicateAck(rcvdSeg) // Stash away the current window size. - s.sndWnd = seg.window + s.sndWnd = rcvdSeg.window - ack := seg.ackNumber + ack := rcvdSeg.ackNumber // Disable zero window probing if remote advertizes a non-zero receive // window. This can be with an ACK to the zero window probe (where the // acknumber refers to the already acknowledged byte) OR to any previously // unacknowledged segment. - if s.zeroWindowProbing && seg.window > 0 && + if s.zeroWindowProbing && rcvdSeg.window > 0 && (ack == s.sndUna || (ack-1).InRange(s.sndUna, s.sndNxt)) { s.disableZeroWindowProbing() } @@ -1344,10 +1348,10 @@ func (s *sender) handleRcvdSegment(seg *segment) { // averaged RTT measurement only if the segment acknowledges // some new data, i.e., only if it advances the left edge of // the send window. - if s.ep.sendTSOk && seg.parsedOptions.TSEcr != 0 { + if s.ep.sendTSOk && rcvdSeg.parsedOptions.TSEcr != 0 { // TSVal/Ecr values sent by Netstack are at a millisecond // granularity. - elapsed := time.Duration(s.ep.timestamp()-seg.parsedOptions.TSEcr) * time.Millisecond + elapsed := time.Duration(s.ep.timestamp()-rcvdSeg.parsedOptions.TSEcr) * time.Millisecond s.updateRTO(elapsed) } @@ -1361,6 +1365,9 @@ func (s *sender) handleRcvdSegment(seg *segment) { ackLeft := acked originalOutstanding := s.outstanding + s.rtt.Lock() + srtt := s.rtt.srtt + s.rtt.Unlock() for ackLeft > 0 { // We use logicalLen here because we can have FIN // segments (which are always at the end of list) that @@ -1380,6 +1387,11 @@ func (s *sender) handleRcvdSegment(seg *segment) { s.writeNext = seg.Next() } + // Update the RACK fields if SACK is enabled. + if s.ep.sackPermitted { + s.rc.Update(seg, rcvdSeg, srtt, s.ep.tsOffset) + } + s.writeList.Remove(seg) // if SACK is enabled then Only reduce outstanding if @@ -1435,7 +1447,7 @@ func (s *sender) handleRcvdSegment(seg *segment) { // that the window opened up, or the congestion window was inflated due // to a duplicate ack during fast recovery. This will also re-enable // the retransmit timer if needed. - if !s.ep.sackPermitted || s.fr.active || s.dupAckCount == 0 || seg.hasNewSACKInfo { + if !s.ep.sackPermitted || s.fr.active || s.dupAckCount == 0 || rcvdSeg.hasNewSACKInfo { s.sendData() } } diff --git a/pkg/tcpip/transport/tcp/tcp_rack_test.go b/pkg/tcpip/transport/tcp/tcp_rack_test.go new file mode 100644 index 000000000..e03f101e8 --- /dev/null +++ b/pkg/tcpip/transport/tcp/tcp_rack_test.go @@ -0,0 +1,74 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp_test + +import ( + "testing" + "time" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport/tcp/testing/context" +) + +// TestRACKUpdate tests the RACK related fields are updated when an ACK is +// received on a SACK enabled connection. +func TestRACKUpdate(t *testing.T) { + const maxPayload = 10 + const tsOptionSize = 12 + const maxTCPOptionSize = 40 + + c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxTCPOptionSize+maxPayload)) + defer c.Cleanup() + + var xmitTime time.Time + c.Stack().AddTCPProbe(func(state stack.TCPEndpointState) { + // Validate that the endpoint Sender.RACKState is what we expect. + if state.Sender.RACKState.XmitTime.Before(xmitTime) { + t.Fatalf("RACK transmit time failed to update when an ACK is received") + } + + gotSeq := state.Sender.RACKState.EndSequence + wantSeq := state.Sender.SndNxt + if !gotSeq.LessThanEq(wantSeq) || gotSeq.LessThan(wantSeq) { + t.Fatalf("RACK sequence number failed to update, got: %v, but want: %v", gotSeq, wantSeq) + } + + if state.Sender.RACKState.RTT == 0 { + t.Fatalf("RACK RTT failed to update when an ACK is received") + } + }) + setStackSACKPermitted(t, c, true) + createConnectedWithSACKAndTS(c) + + data := buffer.NewView(maxPayload) + for i := range data { + data[i] = byte(i) + } + + // Write the data. + xmitTime = time.Now() + if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { + t.Fatalf("Write failed: %s", err) + } + + bytesRead := 0 + c.ReceiveAndCheckPacketWithOptions(data, bytesRead, maxPayload, tsOptionSize) + bytesRead += maxPayload + c.SendAck(790, bytesRead) + time.Sleep(200 * time.Millisecond) +} diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index e67ec42b1..1b58eb91b 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -146,6 +146,24 @@ func TestActiveFailedConnectionAttemptIncrement(t *testing.T) { } } +func TestCloseWithoutConnect(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + // Create TCP endpoint. + var err *tcpip.Error + c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ) + if err != nil { + t.Fatalf("NewEndpoint failed: %s", err) + } + + c.EP.Close() + + if got := c.Stack().Stats().TCP.CurrentConnected.Value(); got != 0 { + t.Errorf("got stats.TCP.CurrentConnected.Value() = %d, want = 0", got) + } +} + func TestTCPSegmentsSentIncrement(t *testing.T) { c := context.New(t, defaultMTU) defer c.Cleanup() @@ -1276,68 +1294,91 @@ func TestConnectBindToDevice(t *testing.T) { } } -func TestRstOnSynSent(t *testing.T) { - c := context.New(t, defaultMTU) - defer c.Cleanup() +func TestSynSent(t *testing.T) { + for _, test := range []struct { + name string + reset bool + }{ + {"RstOnSynSent", true}, + {"CloseOnSynSent", false}, + } { + t.Run(test.name, func(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() - // Create an endpoint, don't handshake because we want to interfere with the - // handshake process. - c.Create(-1) + // Create an endpoint, don't handshake because we want to interfere with the + // handshake process. + c.Create(-1) - // Start connection attempt. - waitEntry, ch := waiter.NewChannelEntry(nil) - c.WQ.EventRegister(&waitEntry, waiter.EventOut) - defer c.WQ.EventUnregister(&waitEntry) + // Start connection attempt. + waitEntry, ch := waiter.NewChannelEntry(nil) + c.WQ.EventRegister(&waitEntry, waiter.EventOut) + defer c.WQ.EventUnregister(&waitEntry) - addr := tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort} - if err := c.EP.Connect(addr); err != tcpip.ErrConnectStarted { - t.Fatalf("got Connect(%+v) = %s, want %s", addr, err, tcpip.ErrConnectStarted) - } + addr := tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort} + if err := c.EP.Connect(addr); err != tcpip.ErrConnectStarted { + t.Fatalf("got Connect(%+v) = %s, want %s", addr, err, tcpip.ErrConnectStarted) + } - // Receive SYN packet. - b := c.GetPacket() - checker.IPv4(t, b, - checker.TCP( - checker.DstPort(context.TestPort), - checker.TCPFlags(header.TCPFlagSyn), - ), - ) + // Receive SYN packet. + b := c.GetPacket() + checker.IPv4(t, b, + checker.TCP( + checker.DstPort(context.TestPort), + checker.TCPFlags(header.TCPFlagSyn), + ), + ) - // Ensure that we've reached SynSent state - if got, want := tcp.EndpointState(c.EP.State()), tcp.StateSynSent; got != want { - t.Fatalf("got State() = %s, want %s", got, want) - } - tcpHdr := header.TCP(header.IPv4(b).Payload()) - c.IRS = seqnum.Value(tcpHdr.SequenceNumber()) + if got, want := tcp.EndpointState(c.EP.State()), tcp.StateSynSent; got != want { + t.Fatalf("got State() = %s, want %s", got, want) + } + tcpHdr := header.TCP(header.IPv4(b).Payload()) + c.IRS = seqnum.Value(tcpHdr.SequenceNumber()) - // Send a packet with a proper ACK and a RST flag to cause the socket - // to Error and close out - iss := seqnum.Value(789) - rcvWnd := seqnum.Size(30000) - c.SendPacket(nil, &context.Headers{ - SrcPort: tcpHdr.DestinationPort(), - DstPort: tcpHdr.SourcePort(), - Flags: header.TCPFlagRst | header.TCPFlagAck, - SeqNum: iss, - AckNum: c.IRS.Add(1), - RcvWnd: rcvWnd, - TCPOpts: nil, - }) + if test.reset { + // Send a packet with a proper ACK and a RST flag to cause the socket + // to error and close out. + iss := seqnum.Value(789) + rcvWnd := seqnum.Size(30000) + c.SendPacket(nil, &context.Headers{ + SrcPort: tcpHdr.DestinationPort(), + DstPort: tcpHdr.SourcePort(), + Flags: header.TCPFlagRst | header.TCPFlagAck, + SeqNum: iss, + AckNum: c.IRS.Add(1), + RcvWnd: rcvWnd, + TCPOpts: nil, + }) + } else { + c.EP.Close() + } - // Wait for receive to be notified. - select { - case <-ch: - case <-time.After(3 * time.Second): - t.Fatal("timed out waiting for packet to arrive") - } + // Wait for receive to be notified. + select { + case <-ch: + case <-time.After(3 * time.Second): + t.Fatal("timed out waiting for packet to arrive") + } - if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionRefused { - t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionRefused) - } + if test.reset { + if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionRefused { + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionRefused) + } + } else { + if _, _, err := c.EP.Read(nil); err != tcpip.ErrAborted { + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrAborted) + } + } - // Due to the RST the endpoint should be in an error state. - if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want { - t.Fatalf("got State() = %s, want %s", got, want) + if got := c.Stack().Stats().TCP.CurrentConnected.Value(); got != 0 { + t.Errorf("got stats.TCP.CurrentConnected.Value() = %d, want = 0", got) + } + + // Due to the RST the endpoint should be in an error state. + if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want { + t.Fatalf("got State() = %s, want %s", got, want) + } + }) } } @@ -6071,7 +6112,7 @@ func TestTCPLingerTimeout(t *testing.T) { {"InRangeLingerTimeout", 10 * time.Second, 10 * time.Second}, // Values > stack's TCPLingerTimeout are capped to the stack's // value. Defaults to tcp.DefaultTCPLingerTimeout(60 seconds) - {"AboveMaxLingerTimeout", 65 * time.Second, 60 * time.Second}, + {"AboveMaxLingerTimeout", 125 * time.Second, 120 * time.Second}, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 6e692da07..b7d735889 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -483,10 +483,6 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c nicID = e.BindNICID } - if to.Addr == header.IPv4Broadcast && !e.broadcast { - return 0, nil, tcpip.ErrBroadcastDisabled - } - dst, netProto, err := e.checkV4MappedLocked(*to) if err != nil { return 0, nil, err @@ -503,6 +499,10 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c resolve = route.Resolve } + if !e.broadcast && route.IsBroadcast() { + return 0, nil, tcpip.ErrBroadcastDisabled + } + if route.IsResolutionRequired() { if ch, err := resolve(nil); err != nil { if err == tcpip.ErrWouldBlock { diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go index 90781cf49..66e8911c8 100644 --- a/pkg/tcpip/transport/udp/udp_test.go +++ b/pkg/tcpip/transport/udp/udp_test.go @@ -2142,3 +2142,192 @@ func (c *testContext) checkEndpointReadStats(incr uint64, want tcpip.TransportEn c.t.Errorf("Endpoint stats not matching for error %s got %+v want %+v", err, got, want) } } + +func TestOutgoingSubnetBroadcast(t *testing.T) { + const nicID1 = 1 + + ipv4Addr := tcpip.AddressWithPrefix{ + Address: "\xc0\xa8\x01\x3a", + PrefixLen: 24, + } + ipv4Subnet := ipv4Addr.Subnet() + ipv4SubnetBcast := ipv4Subnet.Broadcast() + ipv4Gateway := tcpip.Address("\xc0\xa8\x01\x01") + ipv4AddrPrefix31 := tcpip.AddressWithPrefix{ + Address: "\xc0\xa8\x01\x3a", + PrefixLen: 31, + } + ipv4Subnet31 := ipv4AddrPrefix31.Subnet() + ipv4Subnet31Bcast := ipv4Subnet31.Broadcast() + ipv4AddrPrefix32 := tcpip.AddressWithPrefix{ + Address: "\xc0\xa8\x01\x3a", + PrefixLen: 32, + } + ipv4Subnet32 := ipv4AddrPrefix32.Subnet() + ipv4Subnet32Bcast := ipv4Subnet32.Broadcast() + ipv6Addr := tcpip.AddressWithPrefix{ + Address: "\x20\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01", + PrefixLen: 64, + } + ipv6Subnet := ipv6Addr.Subnet() + ipv6SubnetBcast := ipv6Subnet.Broadcast() + remNetAddr := tcpip.AddressWithPrefix{ + Address: "\x64\x0a\x7b\x18", + PrefixLen: 24, + } + remNetSubnet := remNetAddr.Subnet() + remNetSubnetBcast := remNetSubnet.Broadcast() + + tests := []struct { + name string + nicAddr tcpip.ProtocolAddress + routes []tcpip.Route + remoteAddr tcpip.Address + requiresBroadcastOpt bool + }{ + { + name: "IPv4 Broadcast to local subnet", + nicAddr: tcpip.ProtocolAddress{ + Protocol: header.IPv4ProtocolNumber, + AddressWithPrefix: ipv4Addr, + }, + routes: []tcpip.Route{ + { + Destination: ipv4Subnet, + NIC: nicID1, + }, + }, + remoteAddr: ipv4SubnetBcast, + requiresBroadcastOpt: true, + }, + { + name: "IPv4 Broadcast to local /31 subnet", + nicAddr: tcpip.ProtocolAddress{ + Protocol: header.IPv4ProtocolNumber, + AddressWithPrefix: ipv4AddrPrefix31, + }, + routes: []tcpip.Route{ + { + Destination: ipv4Subnet31, + NIC: nicID1, + }, + }, + remoteAddr: ipv4Subnet31Bcast, + requiresBroadcastOpt: false, + }, + { + name: "IPv4 Broadcast to local /32 subnet", + nicAddr: tcpip.ProtocolAddress{ + Protocol: header.IPv4ProtocolNumber, + AddressWithPrefix: ipv4AddrPrefix32, + }, + routes: []tcpip.Route{ + { + Destination: ipv4Subnet32, + NIC: nicID1, + }, + }, + remoteAddr: ipv4Subnet32Bcast, + requiresBroadcastOpt: false, + }, + // IPv6 has no notion of a broadcast. + { + name: "IPv6 'Broadcast' to local subnet", + nicAddr: tcpip.ProtocolAddress{ + Protocol: header.IPv6ProtocolNumber, + AddressWithPrefix: ipv6Addr, + }, + routes: []tcpip.Route{ + { + Destination: ipv6Subnet, + NIC: nicID1, + }, + }, + remoteAddr: ipv6SubnetBcast, + requiresBroadcastOpt: false, + }, + { + name: "IPv4 Broadcast to remote subnet", + nicAddr: tcpip.ProtocolAddress{ + Protocol: header.IPv4ProtocolNumber, + AddressWithPrefix: ipv4Addr, + }, + routes: []tcpip.Route{ + { + Destination: remNetSubnet, + Gateway: ipv4Gateway, + NIC: nicID1, + }, + }, + remoteAddr: remNetSubnetBcast, + requiresBroadcastOpt: true, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()}, + + TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}, + }) + e := channel.New(0, defaultMTU, "") + if err := s.CreateNIC(nicID1, e); err != nil { + t.Fatalf("CreateNIC(%d, _): %s", nicID1, err) + } + if err := s.AddProtocolAddress(nicID1, test.nicAddr); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID1, test.nicAddr, err) + } + + s.SetRouteTable(test.routes) + + var netProto tcpip.NetworkProtocolNumber + switch l := len(test.remoteAddr); l { + case header.IPv4AddressSize: + netProto = header.IPv4ProtocolNumber + case header.IPv6AddressSize: + netProto = header.IPv6ProtocolNumber + default: + t.Fatalf("got unexpected address length = %d bytes", l) + } + + wq := waiter.Queue{} + ep, err := s.NewEndpoint(udp.ProtocolNumber, netProto, &wq) + if err != nil { + t.Fatalf("NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, netProto, err) + } + defer ep.Close() + + data := tcpip.SlicePayload([]byte{1, 2, 3, 4}) + to := tcpip.FullAddress{ + Addr: test.remoteAddr, + Port: 80, + } + opts := tcpip.WriteOptions{To: &to} + expectedErrWithoutBcastOpt := tcpip.ErrBroadcastDisabled + if !test.requiresBroadcastOpt { + expectedErrWithoutBcastOpt = nil + } + + if n, _, err := ep.Write(data, opts); err != expectedErrWithoutBcastOpt { + t.Fatalf("got ep.Write(_, _) = (%d, _, %v), want = (_, _, %v)", n, err, expectedErrWithoutBcastOpt) + } + + if err := ep.SetSockOptBool(tcpip.BroadcastOption, true); err != nil { + t.Fatalf("got SetSockOptBool(BroadcastOption, true): %s", err) + } + + if n, _, err := ep.Write(data, opts); err != nil { + t.Fatalf("got ep.Write(_, _) = (%d, _, %s), want = (_, _, nil)", n, err) + } + + if err := ep.SetSockOptBool(tcpip.BroadcastOption, false); err != nil { + t.Fatalf("got SetSockOptBool(BroadcastOption, false): %s", err) + } + + if n, _, err := ep.Write(data, opts); err != expectedErrWithoutBcastOpt { + t.Fatalf("got ep.Write(_, _) = (%d, _, %v), want = (_, _, %v)", n, err, expectedErrWithoutBcastOpt) + } + }) + } +} diff --git a/pkg/test/dockerutil/container.go b/pkg/test/dockerutil/container.go index 441173ec2..052b6b99d 100644 --- a/pkg/test/dockerutil/container.go +++ b/pkg/test/dockerutil/container.go @@ -58,12 +58,6 @@ type Container struct { // a handle to restart the profile. Generally, tests/benchmarks using // profiles need to run as root. profiles []Profile - - // Stores streams attached to the container. Used by WaitForOutputSubmatch. - streams types.HijackedResponse - - // stores previously read data from the attached streams. - streamBuf bytes.Buffer } // RunOpts are options for running a container. @@ -175,11 +169,25 @@ func (c *Container) SpawnProcess(ctx context.Context, r RunOpts, args ...string) return Process{}, err } + // Open a connection to the container for parsing logs and for TTY. + stream, err := c.client.ContainerAttach(ctx, c.id, + types.ContainerAttachOptions{ + Stream: true, + Stdin: true, + Stdout: true, + Stderr: true, + }) + if err != nil { + return Process{}, fmt.Errorf("connect failed container id %s: %v", c.id, err) + } + + c.cleanups = append(c.cleanups, func() { stream.Close() }) + if err := c.Start(ctx); err != nil { return Process{}, err } - return Process{container: c, conn: c.streams}, nil + return Process{container: c, conn: stream}, nil } // Run is analogous to 'docker run'. @@ -273,23 +281,6 @@ func (c *Container) hostConfig(r RunOpts) *container.HostConfig { // Start is analogous to 'docker start'. func (c *Container) Start(ctx context.Context) error { - - // Open a connection to the container for parsing logs and for TTY. - streams, err := c.client.ContainerAttach(ctx, c.id, - types.ContainerAttachOptions{ - Stream: true, - Stdin: true, - Stdout: true, - Stderr: true, - }) - if err != nil { - return fmt.Errorf("failed to connect to container: %v", err) - } - - c.streams = streams - c.cleanups = append(c.cleanups, func() { - c.streams.Close() - }) if err := c.client.ContainerStart(ctx, c.id, types.ContainerStartOptions{}); err != nil { return fmt.Errorf("ContainerStart failed: %v", err) } @@ -454,15 +445,19 @@ func (c *Container) Wait(ctx context.Context) error { // WaitTimeout waits for the container to exit with a timeout. func (c *Container) WaitTimeout(ctx context.Context, timeout time.Duration) error { - timeoutChan := time.After(timeout) + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() statusChan, errChan := c.client.ContainerWait(ctx, c.id, container.WaitConditionNotRunning) select { + case <-ctx.Done(): + if ctx.Err() == context.DeadlineExceeded { + return fmt.Errorf("container %s timed out after %v seconds", c.Name, timeout.Seconds()) + } + return nil case err := <-errChan: return err case <-statusChan: return nil - case <-timeoutChan: - return fmt.Errorf("container %s timed out after %v seconds", c.Name, timeout.Seconds()) } } @@ -481,28 +476,19 @@ func (c *Container) WaitForOutput(ctx context.Context, pattern string, timeout t // WaitForOutputSubmatch searches container logs for the given // pattern or times out. It returns any regexp submatches as well. func (c *Container) WaitForOutputSubmatch(ctx context.Context, pattern string, timeout time.Duration) ([]string, error) { + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() re := regexp.MustCompile(pattern) - if matches := re.FindStringSubmatch(c.streamBuf.String()); matches != nil { - return matches, nil - } - - for exp := time.Now().Add(timeout); time.Now().Before(exp); { - c.streams.Conn.SetDeadline(time.Now().Add(50 * time.Millisecond)) - _, err := stdcopy.StdCopy(&c.streamBuf, &c.streamBuf, c.streams.Reader) - + for { + logs, err := c.Logs(ctx) if err != nil { - // check that it wasn't a timeout - if nerr, ok := err.(net.Error); !ok || !nerr.Timeout() { - return nil, err - } + return nil, fmt.Errorf("failed to get logs: %v logs: %s", err, logs) } - - if matches := re.FindStringSubmatch(c.streamBuf.String()); matches != nil { + if matches := re.FindStringSubmatch(logs); matches != nil { return matches, nil } + time.Sleep(50 * time.Millisecond) } - - return nil, fmt.Errorf("timeout waiting for output %q: out: %s", re.String(), c.streamBuf.String()) } // Kill kills the container. @@ -527,8 +513,18 @@ func (c *Container) CleanUp(ctx context.Context) { for _, profile := range c.profiles { profile.OnCleanUp(c) } + // Forget profiles. c.profiles = nil + + // Execute all cleanups. We execute cleanups here to close any + // open connections to the container before closing. Open connections + // can cause Kill and Remove to hang. + for _, c := range c.cleanups { + c() + } + c.cleanups = nil + // Kill the container. if err := c.Kill(ctx); err != nil && !strings.Contains(err.Error(), "is not running") { // Just log; can't do anything here. @@ -540,9 +536,4 @@ func (c *Container) CleanUp(ctx context.Context) { } // Forget all mounts. c.mounts = nil - // Execute all cleanups. - for _, c := range c.cleanups { - c() - } - c.cleanups = nil } diff --git a/pkg/test/dockerutil/profile.go b/pkg/test/dockerutil/profile.go index 1fab33083..f0396ef24 100644 --- a/pkg/test/dockerutil/profile.go +++ b/pkg/test/dockerutil/profile.go @@ -49,17 +49,16 @@ type Profile interface { // should have --profile set as an option in /etc/docker/daemon.json in // order for profiling to work with Pprof. type Pprof struct { - BasePath string // path to put profiles - BlockProfile bool - CPUProfile bool - GoRoutineProfile bool - HeapProfile bool - MutexProfile bool - Duration time.Duration // duration to run profiler e.g. '10s' or '1m'. - shouldRun bool - cmd *exec.Cmd - stdout io.ReadCloser - stderr io.ReadCloser + BasePath string // path to put profiles + BlockProfile bool + CPUProfile bool + HeapProfile bool + MutexProfile bool + Duration time.Duration // duration to run profiler e.g. '10s' or '1m'. + shouldRun bool + cmd *exec.Cmd + stdout io.ReadCloser + stderr io.ReadCloser } // MakePprofFromFlags makes a Pprof profile from flags. @@ -68,13 +67,12 @@ func MakePprofFromFlags(c *Container) *Pprof { return nil } return &Pprof{ - BasePath: filepath.Join(*pprofBaseDir, c.runtime, c.Name), - BlockProfile: *pprofBlock, - CPUProfile: *pprofCPU, - GoRoutineProfile: *pprofGo, - HeapProfile: *pprofHeap, - MutexProfile: *pprofMutex, - Duration: *duration, + BasePath: filepath.Join(*pprofBaseDir, c.runtime, c.Name), + BlockProfile: *pprofBlock, + CPUProfile: *pprofCPU, + HeapProfile: *pprofHeap, + MutexProfile: *pprofMutex, + Duration: *duration, } } @@ -138,9 +136,6 @@ func (p *Pprof) makeProfileArgs(c *Container) []string { if p.CPUProfile { ret = append(ret, fmt.Sprintf("--profile-cpu=%s", filepath.Join(p.BasePath, "cpu.pprof"))) } - if p.GoRoutineProfile { - ret = append(ret, fmt.Sprintf("--profile-goroutine=%s", filepath.Join(p.BasePath, "go.pprof"))) - } if p.HeapProfile { ret = append(ret, fmt.Sprintf("--profile-heap=%s", filepath.Join(p.BasePath, "heap.pprof"))) } diff --git a/pkg/test/dockerutil/profile_test.go b/pkg/test/dockerutil/profile_test.go index b7b4d7618..8c4ffe483 100644 --- a/pkg/test/dockerutil/profile_test.go +++ b/pkg/test/dockerutil/profile_test.go @@ -51,13 +51,12 @@ func TestPprof(t *testing.T) { { name: "All", pprof: Pprof{ - BasePath: basePath, - BlockProfile: true, - CPUProfile: true, - GoRoutineProfile: true, - HeapProfile: true, - MutexProfile: true, - Duration: 2 * time.Second, + BasePath: basePath, + BlockProfile: true, + CPUProfile: true, + HeapProfile: true, + MutexProfile: true, + Duration: 2 * time.Second, }, expectedFiles: []string{block, cpu, goprofle, heap, mutex}, }, diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 3e5e4c22f..626a3816e 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -101,14 +101,13 @@ const ( // Profiling related commands (see pprof.go for more details). const ( - StartCPUProfile = "Profile.StartCPUProfile" - StopCPUProfile = "Profile.StopCPUProfile" - HeapProfile = "Profile.HeapProfile" - GoroutineProfile = "Profile.GoroutineProfile" - BlockProfile = "Profile.BlockProfile" - MutexProfile = "Profile.MutexProfile" - StartTrace = "Profile.StartTrace" - StopTrace = "Profile.StopTrace" + StartCPUProfile = "Profile.StartCPUProfile" + StopCPUProfile = "Profile.StopCPUProfile" + HeapProfile = "Profile.HeapProfile" + BlockProfile = "Profile.BlockProfile" + MutexProfile = "Profile.MutexProfile" + StartTrace = "Profile.StartTrace" + StopTrace = "Profile.StopTrace" ) // Logging related commands (see logging.go for more details). @@ -129,42 +128,52 @@ type controller struct { // manager holds the containerManager methods. manager *containerManager + + // pprop holds the profile instance if enabled. It may be nil. + pprof *control.Profile } // newController creates a new controller. The caller must call // controller.srv.StartServing() to start the controller. func newController(fd int, l *Loader) (*controller, error) { - srv, err := server.CreateFromFD(fd) + ctrl := &controller{} + var err error + ctrl.srv, err = server.CreateFromFD(fd) if err != nil { return nil, err } - manager := &containerManager{ + ctrl.manager = &containerManager{ startChan: make(chan struct{}), startResultChan: make(chan error), l: l, } - srv.Register(manager) + ctrl.srv.Register(ctrl.manager) if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok { net := &Network{ Stack: eps.Stack, } - srv.Register(net) + ctrl.srv.Register(net) } - srv.Register(&debug{}) - srv.Register(&control.Logging{}) + ctrl.srv.Register(&debug{}) + ctrl.srv.Register(&control.Logging{}) + if l.root.conf.ProfileEnable { - srv.Register(&control.Profile{ - Kernel: l.k, - }) + ctrl.pprof = &control.Profile{Kernel: l.k} + ctrl.srv.Register(ctrl.pprof) } - return &controller{ - srv: srv, - manager: manager, - }, nil + return ctrl, nil +} + +func (c *controller) stop() { + if c.pprof != nil { + // These are noop if there is nothing being profiled. + _ = c.pprof.StopCPUProfile(nil, nil) + _ = c.pprof.StopTrace(nil, nil) + } } // containerManager manages sandbox containers. diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 59639ba19..9dd5b0184 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -640,7 +640,7 @@ func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Confi func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error { root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) for _, m := range c.mounts { log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options) @@ -868,7 +868,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns if err != nil { return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err) } - defer dirent.DecRef() + defer dirent.DecRef(ctx) if err := mns.Mount(ctx, dirent, inode); err != nil { return fmt.Errorf("mount %q error: %v", m.Destination, err) } @@ -889,12 +889,12 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun if err != nil { return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err) } - defer target.DecRef() + defer target.DecRef(ctx) // Take a ref on the inode that is about to be (re)-mounted. source.root.IncRef() if err := mns.Mount(ctx, target, source.root); err != nil { - source.root.DecRef() + source.root.DecRef(ctx) return fmt.Errorf("bind mount %q error: %v", mount.Destination, err) } @@ -997,12 +997,12 @@ func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.M switch err { case nil: // Found '/tmp' in filesystem, check if it's empty. - defer tmp.DecRef() + defer tmp.DecRef(ctx) f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true}) if err != nil { return err } - defer f.DecRef() + defer f.DecRef(ctx) serializer := &fs.CollectEntriesSerializer{} if err := f.Readdir(ctx, serializer); err != nil { return err diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 9cd9c5909..533b9c5e7 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -16,12 +16,12 @@ package boot import ( + "errors" "fmt" mrand "math/rand" "os" "runtime" "sync/atomic" - "syscall" gtime "time" specs "github.com/opencontainers/runtime-spec/specs-go" @@ -187,7 +187,7 @@ type Args struct { } // make sure stdioFDs are always the same on initial start and on restore -const startingStdioFD = 64 +const startingStdioFD = 256 // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. @@ -346,7 +346,7 @@ func New(args Args) (*Loader, error) { if err != nil { return nil, fmt.Errorf("failed to create hostfs filesystem: %v", err) } - defer hostFilesystem.DecRef() + defer hostFilesystem.DecRef(k.SupervisorContext()) hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{}) if err != nil { return nil, fmt.Errorf("failed to create hostfs mount: %v", err) @@ -360,15 +360,20 @@ func New(args Args) (*Loader, error) { var stdioFDs []int newfd := startingStdioFD for _, fd := range args.StdioFDs { - err := syscall.Dup3(fd, newfd, syscall.O_CLOEXEC) + // Check that newfd is unused to avoid clobbering over it. + if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) { + if err != nil { + return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err) + } + return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd) + } + + err := unix.Dup3(fd, newfd, unix.O_CLOEXEC) if err != nil { return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err) } stdioFDs = append(stdioFDs, newfd) - err = syscall.Close(fd) - if err != nil { - return nil, fmt.Errorf("close original stdioFDs failed: %v", err) - } + _ = unix.Close(fd) newfd++ } @@ -458,6 +463,11 @@ func (l *Loader) Destroy() { l.stopSignalForwarding() } l.watchdog.Stop() + + for i, fd := range l.root.stdioFDs { + _ = unix.Close(fd) + l.root.stdioFDs[i] = -1 + } } func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) { @@ -591,11 +601,9 @@ func (l *Loader) run() error { // during restore, we can release l.stdioFDs now. VFS2 takes ownership of the // passed FDs, so only close for VFS1. if !kernel.VFS2Enabled { - for _, fd := range l.root.stdioFDs { - err := syscall.Close(fd) - if err != nil { - return fmt.Errorf("close dup()ed stdioFDs: %v", err) - } + for i, fd := range l.root.stdioFDs { + _ = unix.Close(fd) + l.root.stdioFDs[i] = -1 } } @@ -686,7 +694,7 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file // Can't take ownership away from os.File. dup them to get a new FDs. for _, f := range files[3:] { - fd, err := syscall.Dup(int(f.Fd())) + fd, err := unix.Dup(int(f.Fd())) if err != nil { return fmt.Errorf("failed to dup file: %v", err) } @@ -755,7 +763,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn return nil, fmt.Errorf("creating process: %v", err) } // CreateProcess takes a reference on FDTable if successful. - info.procArgs.FDTable.DecRef() + info.procArgs.FDTable.DecRef(ctx) // Set the foreground process group on the TTY to the global init process // group, since that is what we are about to start running. @@ -890,22 +898,20 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { // Add the HOME environment variable if it is not already set. if kernel.VFS2Enabled { - defer args.MountNamespaceVFS2.DecRef() - root := args.MountNamespaceVFS2.Root() - defer root.DecRef() ctx := vfs.WithRoot(l.k.SupervisorContext(), root) + defer args.MountNamespaceVFS2.DecRef(ctx) + defer root.DecRef(ctx) envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv) if err != nil { return 0, err } args.Envv = envv } else { - defer args.MountNamespace.DecRef() - root := args.MountNamespace.Root() - defer root.DecRef() ctx := fs.WithRoot(l.k.SupervisorContext(), root) + defer args.MountNamespace.DecRef(ctx) + defer root.DecRef(ctx) envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv) if err != nil { return 0, err @@ -1002,6 +1008,9 @@ func (l *Loader) WaitExit() kernel.ExitStatus { // Wait for container. l.k.WaitExited() + // Cleanup + l.ctrl.stop() + return l.k.GlobalInit().ExitStatus() } @@ -1263,7 +1272,7 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F fdTable := k.NewFDTable() ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs) if err != nil { - fdTable.DecRef() + fdTable.DecRef(ctx) return nil, nil, nil, err } return fdTable, ttyFile, ttyFileVFS2, nil diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 8e6fe57e1..aa3fdf96c 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -450,13 +450,13 @@ func TestCreateMountNamespace(t *testing.T) { } root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) for _, p := range tc.expectedPaths { maxTraversals := uint(0) if d, err := mns.FindInode(ctx, root, root, p, &maxTraversals); err != nil { t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) } else { - d.DecRef() + d.DecRef(ctx) } } }) @@ -491,7 +491,7 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { } root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) for _, p := range tc.expectedPaths { target := &vfs.PathOperation{ Root: root, @@ -502,7 +502,7 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { if d, err := l.k.VFS().GetDentryAt(ctx, l.root.procArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil { t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) } else { - d.DecRef() + d.DecRef(ctx) } } }) diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 9a1ed8e9e..e7d6035bb 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -37,6 +37,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -89,6 +90,12 @@ func registerFilesystems(k *kernel.Kernel) error { if err := ttydev.Register(vfsObj); err != nil { return fmt.Errorf("registering ttydev: %w", err) } + tunSupported := tundev.IsNetTunSupported(inet.StackFromContext(ctx)) + if tunSupported { + if err := tundev.Register(vfsObj); err != nil { + return fmt.Errorf("registering tundev: %v", err) + } + } if kernel.FUSEEnabled { if err := fuse.Register(vfsObj); err != nil { @@ -96,14 +103,11 @@ func registerFilesystems(k *kernel.Kernel) error { } } - if err := tundev.Register(vfsObj); err != nil { - return fmt.Errorf("registering tundev: %v", err) - } a, err := devtmpfs.NewAccessor(ctx, vfsObj, creds, devtmpfs.Name) if err != nil { return fmt.Errorf("creating devtmpfs accessor: %w", err) } - defer a.Release() + defer a.Release(ctx) if err := a.UserspaceInit(ctx); err != nil { return fmt.Errorf("initializing userspace: %w", err) @@ -114,8 +118,10 @@ func registerFilesystems(k *kernel.Kernel) error { if err := ttydev.CreateDevtmpfsFiles(ctx, a); err != nil { return fmt.Errorf("creating ttydev devtmpfs files: %w", err) } - if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil { - return fmt.Errorf("creating tundev devtmpfs files: %v", err) + if tunSupported { + if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil { + return fmt.Errorf("creating tundev devtmpfs files: %v", err) + } } if kernel.FUSEEnabled { @@ -171,10 +177,19 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) { fd := c.fds.remove() - opts := strings.Join(p9MountData(fd, conf.FileAccess, true /* vfs2 */), ",") + opts := p9MountData(fd, conf.FileAccess, true /* vfs2 */) + + if conf.OverlayfsStaleRead { + // We can't check for overlayfs here because sandbox is chroot'ed and gofer + // can only send mount options for specs.Mounts (specs.Root is missing + // Options field). So assume root is always on top of overlayfs. + opts = append(opts, "overlayfs_stale_read") + } log.Infof("Mounting root over 9P, ioFD: %d", fd) - mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{Data: opts}) + mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{ + Data: strings.Join(opts, ","), + }) if err != nil { return nil, fmt.Errorf("setting up mount namespace: %w", err) } @@ -243,7 +258,7 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) { func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error { root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) target := &vfs.PathOperation{ Root: root, Start: root, @@ -378,7 +393,7 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds } root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) pop := vfs.PathOperation{ Root: root, Start: root, @@ -472,10 +487,10 @@ func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *Co if err != nil { return err } - defer newMnt.DecRef() + defer newMnt.DecRef(ctx) root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) if err := c.makeSyntheticMount(ctx, mount.Destination, root, creds); err != nil { return err } diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index b5de2588b..742f8c344 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -32,20 +32,19 @@ import ( // Debug implements subcommands.Command for the "debug" command. type Debug struct { - pid int - stacks bool - signal int - profileHeap string - profileCPU string - profileGoroutine string - profileBlock string - profileMutex string - trace string - strace string - logLevel string - logPackets string - duration time.Duration - ps bool + pid int + stacks bool + signal int + profileHeap string + profileCPU string + profileBlock string + profileMutex string + trace string + strace string + logLevel string + logPackets string + duration time.Duration + ps bool } // Name implements subcommands.Command. @@ -69,7 +68,6 @@ func (d *Debug) SetFlags(f *flag.FlagSet) { f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log") f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.") f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.") - f.StringVar(&d.profileGoroutine, "profile-goroutine", "", "writes goroutine profile to the given file.") f.StringVar(&d.profileBlock, "profile-block", "", "writes block profile to the given file.") f.StringVar(&d.profileMutex, "profile-mutex", "", "writes mutex profile to the given file.") f.DurationVar(&d.duration, "duration", time.Second, "amount of time to wait for CPU and trace profiles") @@ -153,18 +151,6 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } log.Infof("Heap profile written to %q", d.profileHeap) } - if d.profileGoroutine != "" { - f, err := os.Create(d.profileGoroutine) - if err != nil { - return Errorf(err.Error()) - } - defer f.Close() - - if err := c.Sandbox.GoroutineProfile(f); err != nil { - return Errorf(err.Error()) - } - log.Infof("Goroutine profile written to %q", d.profileGoroutine) - } if d.profileBlock != "" { f, err := os.Create(d.profileBlock) if err != nil { diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index ebefeacf2..c6694c278 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -979,9 +979,12 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { skip := uint64(0) - // Check if the file is at the correct position already. If not, seek to the - // beginning and read the entire directory again. - if l.lastDirentOffset != offset { + // Check if the file is at the correct position already. If not, seek to + // the beginning and read the entire directory again. We always seek if + // offset is 0, since this is side-effectual (equivalent to rewinddir(3), + // which causes the directory stream to resynchronize with the directory's + // current contents). + if l.lastDirentOffset != offset || offset == 0 { if _, err := syscall.Seek(l.file.FD(), 0, 0); err != nil { return nil, extractErrno(err) } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 2afcc27af..36bb0c9c9 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -1012,26 +1012,6 @@ func (s *Sandbox) StopCPUProfile() error { return nil } -// GoroutineProfile writes a goroutine profile to the given file. -func (s *Sandbox) GoroutineProfile(f *os.File) error { - log.Debugf("Goroutine profile %q", s.ID) - conn, err := s.sandboxConnect() - if err != nil { - return err - } - defer conn.Close() - - opts := control.ProfileOpts{ - FilePayload: urpc.FilePayload{ - Files: []*os.File{f}, - }, - } - if err := conn.Call(boot.GoroutineProfile, &opts, nil); err != nil { - return fmt.Errorf("getting sandbox %q goroutine profile: %v", s.ID, err) - } - return nil -} - // BlockProfile writes a block profile to the given file. func (s *Sandbox) BlockProfile(f *os.File) error { log.Debugf("Block profile %q", s.ID) diff --git a/scripts/docker_tests.sh b/scripts/docker_tests.sh index 07e9f3109..be0b0a3ec 100755 --- a/scripts/docker_tests.sh +++ b/scripts/docker_tests.sh @@ -23,5 +23,5 @@ test_runsc //test/image:image_test //test/e2e:integration_test install_runsc_for_test docker --vfs2 IMAGE_FILTER="Hello|Httpd|Ruby|Stdio" -INTEGRATION_FILTER="LifeCycle|Pause|Connect|JobControl|Overlay|Exec|DirCreation/root" +INTEGRATION_FILTER="LifeCycle|Pause|Connect|JobControl|Overlay|Exec|DirCreation/root|Link" test_runsc //test/e2e:integration_test //test/image:image_test --test_filter="${IMAGE_FILTER}|${INTEGRATION_FILTER}" diff --git a/benchmarks/run.py b/scripts/fuse_tests.sh index a22eb8641..bbaaa99fc 100644..100755 --- a/benchmarks/run.py +++ b/scripts/fuse_tests.sh @@ -1,19 +1,20 @@ -# python3 -# Copyright 2019 Google LLC +#!/bin/bash + +# Copyright 2020 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Benchmark runner.""" -from benchmarks import runner +source $(dirname $0)/common.sh -if __name__ == "__main__": - runner.runner() +# Run all vfs2_fuse system call tests. +test --test_tag_filters=fuse //test/fuse/... diff --git a/scripts/simple_tests.sh b/scripts/simple_tests.sh index 3a15050c2..585216aae 100755 --- a/scripts/simple_tests.sh +++ b/scripts/simple_tests.sh @@ -17,4 +17,4 @@ source $(dirname $0)/common.sh # Run all simple tests (locally). -test //pkg/... //runsc/... //tools/... //benchmarks/... //benchmarks/runner:runner_test +test //pkg/... //runsc/... //tools/... diff --git a/shim/README.md b/shim/README.md index c6824ebdc..75daf00ac 100644 --- a/shim/README.md +++ b/shim/README.md @@ -1,11 +1,10 @@ -# gVisor Containerd shims +# Shim Overview -There are various shims supported for differt versions of -[containerd][containerd]. +Integration with containerd is done via a [shim][shims]. There are various shims +supported for different versions of [containerd][containerd]. -- [Configure gvisor-containerd-shim (shim v1) (containerd ≤ 1.2)](v1/configure-gvisor-containerd-shim.md) -- [Runtime Handler/RuntimeClass Quick Start (containerd >= 1.2)](v2/runtime-handler-quickstart.md) -- [Runtime Handler/RuntimeClass Quick Start (shim v2) (containerd >= 1.2)](v2/runtime-handler-shim-v2-quickstart.md) -- [Configure containerd-shim-runsc-v1 (shim v2) (containerd >= 1.3)](v2/configure-containerd-shim-runsc-v1.md) +- [Containerd 1.2+ (shim v2)](https://gvisor.dev/docs/user_guide/containerd/quick_start/) +- [Containerd 1.1 (shim v1)](https://gvisor.dev/docs/user_guide/containerd/containerd_11/) [containerd]: https://github.com/containerd/containerd +[shims]: https://iximiuz.com/en/posts/implementing-container-runtime-shim/ diff --git a/shim/v1/BUILD b/shim/v1/BUILD index 7b837630c..4c9e2c2c6 100644 --- a/shim/v1/BUILD +++ b/shim/v1/BUILD @@ -1,5 +1,4 @@ load("//tools:defs.bzl", "go_binary") -load("//website:defs.bzl", "doc") package(licenses = ["notice"]) @@ -29,13 +28,3 @@ go_binary( "@org_golang_x_sys//unix:go_default_library", ], ) - -doc( - name = "doc", - src = "README.md", - category = "User Guide", - permalink = "/docs/user_guide/gvisor-containerd-shim/", - subcategory = "Advanced", - visibility = ["//website:__pkg__"], - weight = "93", -) diff --git a/shim/v1/README.md b/shim/v1/README.md deleted file mode 100644 index 7aa4513a1..000000000 --- a/shim/v1/README.md +++ /dev/null @@ -1,50 +0,0 @@ -# gvisor-containerd-shim - -> Note: This shim version is supported only for containerd versions less than -> 1.2. If you are using a containerd version greater than or equal to 1.2, then -> please use `containerd-shim-runsc-v1` (Shim API v1). -> -> This containerd shim is supported only in a best-effort capacity. - -This document describes how to configure and use `gvisor-containerd-shim`. - -## Containerd Configuration - -To use this shim, you must configure `/etc/containerd/config.toml` as follows: - -``` -[plugins.linux] - shim = "/usr/bin/gvisor-containerd-shim" -[plugins.cri.containerd.runtimes.gvisor] - runtime_type = "io.containerd.runtime.v1.linux" - runtime_engine = "/usr/bin/runsc" - runtime_root = "/run/containerd/runsc" -``` - -In order to pick-up the new configuration, you may need to restart containerd: - -```shell -sudo systemctl restart containerd -``` - -## Shim Confguration - -The shim configuration is stored in `/etc/containerd/runsc.toml`. The -configuration file supports two values. - -* `runc_shim`: The path to the runc shim. This is used by - `gvisor-containerd-shim` to run standard containers. - -* `runsc_config`: This is a set of key/value pairs that are converted into - `runsc` command line flags. You can learn more about which flags are - available by running `runsc flags`. - -For example, a configuration might look as follows: - -``` -runc_shim = "/usr/local/bin/containerd-shim" -[runsc_config] -platform = "kvm" -debug = true -debug-log = /var/log/%ID%/gvisor/ -``` diff --git a/shim/v2/BUILD b/shim/v2/BUILD index ae4705935..8de9ac0ba 100644 --- a/shim/v2/BUILD +++ b/shim/v2/BUILD @@ -1,5 +1,4 @@ load("//tools:defs.bzl", "go_binary") -load("//website:defs.bzl", "doc") package(licenses = ["notice"]) @@ -17,13 +16,3 @@ go_binary( "@com_github_containerd_containerd//runtime/v2/shim:go_default_library", ], ) - -doc( - name = "doc", - src = "README.md", - category = "User Guide", - permalink = "/docs/user_guide/containerd-shim-runsc-v1/", - subcategory = "Advanced", - visibility = ["//website:__pkg__"], - weight = "92", -) diff --git a/shim/v2/README.md b/shim/v2/README.md deleted file mode 100644 index 2aa7c21e3..000000000 --- a/shim/v2/README.md +++ /dev/null @@ -1,91 +0,0 @@ -# containerd-shim-runsc-v1 - -> Note: This shim version is the recommended shim for containerd versions -> greater than or equal to 1.2. For older versions of containerd, use -> `gvisor-containerd-shim`. - -This document describes how to configure and use `containerd-shim-runsc-v1`. - -## Configuring Containerd 1.2 - -To configure containerd 1.2 to use this shim, add the runtime to -`/etc/containerd/config.toml` as follows: - -``` -[plugins.cri.containerd.runtimes.runsc] - runtime_type = "io.containerd.runsc.v1" - runtime_root = "/run/containerd/runsc" -[plugins.cri.containerd.runtimes.runsc.options] - TypeUrl = "io.containerd.runsc.v1.options" -``` - -The configuration will optionally loaded from a file named `config.toml` in the -`runtime_root` configured above. - -In order to pick up the new configuration, you may need to restart containerd: - -```shell -sudo systemctl restart containerd -``` - -## Configuring Containerd 1.3 and above - -To configure containerd 1.3 to use this shim, add the runtime to -`/etc/containerd/config.toml` as follows: - -``` -[plugins.cri.containerd.runtimes.runsc] - runtime_type = "io.containerd.runsc.v1" -[plugins.cri.containerd.runtimes.runsc.options] - TypeUrl = "io.containerd.runsc.v1.options" - ConfigPath = "/etc/containerd/runsc.toml" -``` - -The `ConfigPath` above will be used to provide a pointer to the configuration -file to be loaded. - -> Note that there will be configuration file loaded if `ConfigPath` is not set. - -In order to pick up the new configuration, you may need to restart containerd: - -```shell -sudo systemctl restart containerd -``` - -## Shim Confguration - -The shim configuration may carry the following options: - -* `shim_cgroup`: The cgroup to use for the shim itself. -* `io_uid`: The UID to use for pipes. -* `ui_gid`: The GID to use for pipes. -* `binary_name`: The runtime binary name (defaults to `runsc`). -* `root`: The root directory for the runtime. -* `runsc_config`: A dictionary of key-value pairs that will be passed to the - runtime as arguments. - -### Example: Enable the KVM platform - -gVisor enables the use of a number of platforms. This example shows how to -configure `containerd-shim-runsc-v1` to use gVisor with the KVM platform: - -```shell -cat <<EOF | sudo tee /etc/containerd/runsc.toml -[runsc_config] -platform = "kvm" -EOF -``` - -### Example: Enable gVisor debug logging - -gVisor debug logging can be enabled by setting the `debug` and `debug-log` flag. -The shim will replace "%ID%" with the container ID in the path of the -`debug-log` flag. - -```shell -cat <<EOF | sudo tee /etc/containerd/runsc.toml -[runsc_config] -debug = true -debug-log = /var/log/%ID%/gvisor.log -EOF -``` diff --git a/shim/v2/runtime-handler-shim-v2-quickstart.md b/shim/v2/runtime-handler-shim-v2-quickstart.md deleted file mode 100644 index 3b88ca74b..000000000 --- a/shim/v2/runtime-handler-shim-v2-quickstart.md +++ /dev/null @@ -1,251 +0,0 @@ -# Runtime Handler Quickstart (Shim V2) - -This document describes how to install and run `containerd-shim-runsc-v1` using -the containerd runtime handler support. This requires containerd 1.2 or later. - -## Requirements - -- **runsc**: See the [gVisor documentation](https://github.com/google/gvisor) - for information on how to install runsc. -- **containerd**: See the [containerd website](https://containerd.io/) for - information on how to install containerd. - -## Install - -### Install containerd-shim-runsc-v1 - -1. Build and install `containerd-shim-runsc-v1`. - -<!-- TODO: Use a release once we have one available. --> - -[embedmd]:# (../test/e2e/shim-install.sh shell /{ # Step 1\(dev\)/ /^}/) - -```shell -{ # Step 1(dev): Build and install gvisor-containerd-shim and containerd-shim-runsc-v1 - make - sudo make install -} -``` - -### Configure containerd - -1. Update `/etc/containerd/config.toml`. Make sure `containerd-shim-runsc-v1` - is in `${PATH}`. - -[embedmd]:# (../test/e2e/runtime-handler-shim-v2/install.sh shell /{ # Step 1/ /^}/) - -```shell -{ # Step 1: Create containerd config.toml -cat <<EOF | sudo tee /etc/containerd/config.toml -disabled_plugins = ["restart"] -[plugins.linux] - shim_debug = true -[plugins.cri.containerd.runtimes.runsc] - runtime_type = "io.containerd.runsc.v1" -EOF -} -``` - -1. Restart `containerd` - -```shell -sudo systemctl restart containerd -``` - -## Usage - -You can run containers in gVisor via containerd's CRI. - -### Install crictl - -1. Download and install the crictl binary: - -[embedmd]:# (../test/e2e/crictl-install.sh shell /{ # Step 1/ /^}/) - -```shell -{ # Step 1: Download crictl -wget https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.13.0/crictl-v1.13.0-linux-amd64.tar.gz -tar xf crictl-v1.13.0-linux-amd64.tar.gz -sudo mv crictl /usr/local/bin -} -``` - -1. Write the crictl configuration file - -[embedmd]:# (../test/e2e/crictl-install.sh shell /{ # Step 2/ /^}/) - -```shell -{ # Step 2: Configure crictl -cat <<EOF | sudo tee /etc/crictl.yaml -runtime-endpoint: unix:///run/containerd/containerd.sock -EOF -} -``` - -### Create the nginx Sandbox in gVisor - -1. Pull the nginx image - -[embedmd]:# (../test/e2e/runtime-handler/usage.sh shell /{ # Step 1/ /^}/) - -```shell -{ # Step 1: Pull the nginx image -sudo crictl pull nginx -} -``` - -1. Create the sandbox creation request - -[embedmd]:# (../test/e2e/runtime-handler/usage.sh shell /{ # Step 2/ /^EOF\n}/) - -```shell -{ # Step 2: Create sandbox.json -cat <<EOF | tee sandbox.json -{ - "metadata": { - "name": "nginx-sandbox", - "namespace": "default", - "attempt": 1, - "uid": "hdishd83djaidwnduwk28bcsb" - }, - "linux": { - }, - "log_directory": "/tmp" -} -EOF -} -``` - -1. Create the pod in gVisor - -[embedmd]:# (../test/e2e/runtime-handler/usage.sh shell /{ # Step 3/ /^}/) - -```shell -{ # Step 3: Create the sandbox -SANDBOX_ID=$(sudo crictl runp --runtime runsc sandbox.json) -} -``` - -### Run the nginx Container in the Sandbox - -1. Create the nginx container creation request - -[embedmd]:# (../test/e2e/run-container.sh shell /{ # Step 1/ /^EOF\n}/) - -```shell -{ # Step 1: Create nginx container config -cat <<EOF | tee container.json -{ - "metadata": { - "name": "nginx" - }, - "image":{ - "image": "nginx" - }, - "log_path":"nginx.0.log", - "linux": { - } -} -EOF -} -``` - -1. Create the nginx container - -[embedmd]:# (../test/e2e/run-container.sh shell /{ # Step 2/ /^}/) - -```shell -{ # Step 2: Create nginx container -CONTAINER_ID=$(sudo crictl create ${SANDBOX_ID} container.json sandbox.json) -} -``` - -1. Start the nginx container - -[embedmd]:# (../test/e2e/run-container.sh shell /{ # Step 3/ /^}/) - -```shell -{ # Step 3: Start nginx container -sudo crictl start ${CONTAINER_ID} -} -``` - -### Validate the container - -1. Inspect the created pod - -[embedmd]:# (../test/e2e/validate.sh shell /{ # Step 1/ /^}/) - -```shell -{ # Step 1: Inspect the pod -sudo crictl inspectp ${SANDBOX_ID} -} -``` - -1. Inspect the nginx container - -[embedmd]:# (../test/e2e/validate.sh shell /{ # Step 2/ /^}/) - -```shell -{ # Step 2: Inspect the container -sudo crictl inspect ${CONTAINER_ID} -} -``` - -1. Verify that nginx is running in gVisor - -[embedmd]:# (../test/e2e/validate.sh shell /{ # Step 3/ /^}/) - -```shell -{ # Step 3: Check dmesg -sudo crictl exec ${CONTAINER_ID} dmesg | grep -i gvisor -} -``` - -### Set up the Kubernetes Runtime Class - -1. Install the Runtime Class for gVisor - -[embedmd]:# (../test/e2e/runtimeclass-install.sh shell /{ # Step 1/ /^}/) - -```shell -{ # Step 1: Install a RuntimeClass -cat <<EOF | kubectl apply -f - -apiVersion: node.k8s.io/v1beta1 -kind: RuntimeClass -metadata: - name: gvisor -handler: runsc -EOF -} -``` - -1. Create a Pod with the gVisor Runtime Class - -[embedmd]:# (../test/e2e/runtimeclass-install.sh shell /{ # Step 2/ /^}/) - -```shell -{ # Step 2: Create a pod -cat <<EOF | kubectl apply -f - -apiVersion: v1 -kind: Pod -metadata: - name: nginx-gvisor -spec: - runtimeClassName: gvisor - containers: - - name: nginx - image: nginx -EOF -} -``` - -1. Verify that the Pod is running - -[embedmd]:# (../test/e2e/runtimeclass-install.sh shell /{ # Step 3/ /^}/) - -```shell -{ # Step 3: Get the pod -kubectl get pod nginx-gvisor -o wide -} -``` diff --git a/test/benchmarks/base/BUILD b/test/benchmarks/base/BUILD new file mode 100644 index 000000000..5e099d0f9 --- /dev/null +++ b/test/benchmarks/base/BUILD @@ -0,0 +1,33 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "base", + testonly = 1, + srcs = [ + "base.go", + ], + deps = ["//test/benchmarks/harness"], +) + +go_test( + name = "base_test", + size = "large", + srcs = [ + "size_test.go", + "startup_test.go", + "sysbench_test.go", + ], + library = ":base", + tags = [ + # Requires docker and runsc to be configured before test runs. + "manual", + "local", + ], + deps = [ + "//pkg/test/dockerutil", + "//test/benchmarks/harness", + "//test/benchmarks/tools", + ], +) diff --git a/test/benchmarks/base/base.go b/test/benchmarks/base/base.go new file mode 100644 index 000000000..7bac52ff1 --- /dev/null +++ b/test/benchmarks/base/base.go @@ -0,0 +1,31 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package base holds base performance benchmarks. +package base + +import ( + "os" + "testing" + + "gvisor.dev/gvisor/test/benchmarks/harness" +) + +var testHarness harness.Harness + +// TestMain is the main method for package network. +func TestMain(m *testing.M) { + testHarness.Init() + os.Exit(m.Run()) +} diff --git a/test/benchmarks/base/size_test.go b/test/benchmarks/base/size_test.go new file mode 100644 index 000000000..3c1364faf --- /dev/null +++ b/test/benchmarks/base/size_test.go @@ -0,0 +1,220 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package base + +import ( + "context" + "testing" + "time" + + "gvisor.dev/gvisor/pkg/test/dockerutil" + "gvisor.dev/gvisor/test/benchmarks/harness" + "gvisor.dev/gvisor/test/benchmarks/tools" +) + +// BenchmarkSizeEmpty creates N empty containers and reads memory usage from +// /proc/meminfo. +func BenchmarkSizeEmpty(b *testing.B) { + machine, err := testHarness.GetMachine() + if err != nil { + b.Fatalf("failed to get machine: %v", err) + } + defer machine.CleanUp() + meminfo := tools.Meminfo{} + ctx := context.Background() + containers := make([]*dockerutil.Container, 0, b.N) + + // DropCaches before the test. + harness.DropCaches(machine) + + // Check available memory on 'machine'. + cmd, args := meminfo.MakeCmd() + before, err := machine.RunCommand(cmd, args...) + if err != nil { + b.Fatalf("failed to get meminfo: %v", err) + } + + // Make N containers. + for i := 0; i < b.N; i++ { + container := machine.GetContainer(ctx, b) + containers = append(containers, container) + if err := container.Spawn(ctx, dockerutil.RunOpts{ + Image: "benchmarks/alpine", + }, "sh", "-c", "echo Hello && sleep 1000"); err != nil { + cleanUpContainers(ctx, containers) + b.Fatalf("failed to run container: %v", err) + } + if _, err := container.WaitForOutputSubmatch(ctx, "Hello", 5*time.Second); err != nil { + cleanUpContainers(ctx, containers) + b.Fatalf("failed to read container output: %v", err) + } + } + + // Drop caches again before second measurement. + harness.DropCaches(machine) + + // Check available memory after containers are up. + after, err := machine.RunCommand(cmd, args...) + cleanUpContainers(ctx, containers) + if err != nil { + b.Fatalf("failed to get meminfo: %v", err) + } + meminfo.Report(b, before, after) +} + +// BenchmarkSizeNginx starts N containers running Nginx, checks that they're +// serving, and checks memory used based on /proc/meminfo. +func BenchmarkSizeNginx(b *testing.B) { + machine, err := testHarness.GetMachine() + if err != nil { + b.Fatalf("failed to get machine with: %v", err) + } + defer machine.CleanUp() + + // DropCaches for the first measurement. + harness.DropCaches(machine) + + // Measure MemAvailable before creating containers. + meminfo := tools.Meminfo{} + cmd, args := meminfo.MakeCmd() + before, err := machine.RunCommand(cmd, args...) + if err != nil { + b.Fatalf("failed to run meminfo command: %v", err) + } + + // Make N Nginx containers. + ctx := context.Background() + runOpts := dockerutil.RunOpts{ + Image: "benchmarks/nginx", + } + const port = 80 + servers := startServers(ctx, b, + serverArgs{ + machine: machine, + port: port, + runOpts: runOpts, + }) + defer cleanUpContainers(ctx, servers) + + // DropCaches after servers are created. + harness.DropCaches(machine) + // Take after measurement. + after, err := machine.RunCommand(cmd, args...) + if err != nil { + b.Fatalf("failed to run meminfo command: %v", err) + } + meminfo.Report(b, before, after) +} + +// BenchmarkSizeNode starts N containers running a Node app, checks that +// they're serving, and checks memory used based on /proc/meminfo. +func BenchmarkSizeNode(b *testing.B) { + machine, err := testHarness.GetMachine() + if err != nil { + b.Fatalf("failed to get machine with: %v", err) + } + defer machine.CleanUp() + + // Make a redis instance for Node to connect. + ctx := context.Background() + redis, redisIP := redisInstance(ctx, b, machine) + defer redis.CleanUp(ctx) + + // DropCaches after redis is created. + harness.DropCaches(machine) + + // Take before measurement. + meminfo := tools.Meminfo{} + cmd, args := meminfo.MakeCmd() + before, err := machine.RunCommand(cmd, args...) + if err != nil { + b.Fatalf("failed to run meminfo commend: %v", err) + } + + // Create N Node servers. + runOpts := dockerutil.RunOpts{ + Image: "benchmarks/node", + WorkDir: "/usr/src/app", + Links: []string{redis.MakeLink("redis")}, + } + nodeCmd := []string{"node", "index.js", redisIP.String()} + const port = 8080 + servers := startServers(ctx, b, + serverArgs{ + machine: machine, + port: port, + runOpts: runOpts, + cmd: nodeCmd, + }) + defer cleanUpContainers(ctx, servers) + + // DropCaches after servers are created. + harness.DropCaches(machine) + // Take after measurement. + cmd, args = meminfo.MakeCmd() + after, err := machine.RunCommand(cmd, args...) + if err != nil { + b.Fatalf("failed to run meminfo command: %v", err) + } + meminfo.Report(b, before, after) +} + +// serverArgs wraps args for startServers and runServerWorkload. +type serverArgs struct { + machine harness.Machine + port int + runOpts dockerutil.RunOpts + cmd []string +} + +// startServers starts b.N containers defined by 'runOpts' and 'cmd' and uses +// 'machine' to check that each is up. +func startServers(ctx context.Context, b *testing.B, args serverArgs) []*dockerutil.Container { + b.Helper() + servers := make([]*dockerutil.Container, 0, b.N) + + // Create N servers and wait until each of them is serving. + for i := 0; i < b.N; i++ { + server := args.machine.GetContainer(ctx, b) + servers = append(servers, server) + if err := server.Spawn(ctx, args.runOpts, args.cmd...); err != nil { + cleanUpContainers(ctx, servers) + b.Fatalf("failed to spawn node instance: %v", err) + } + + // Get the container IP. + servingIP, err := server.FindIP(ctx, false) + if err != nil { + cleanUpContainers(ctx, servers) + b.Fatalf("failed to get ip from server: %v", err) + } + + // Wait until the server is up. + if err := harness.WaitUntilServing(ctx, args.machine, servingIP, args.port); err != nil { + cleanUpContainers(ctx, servers) + b.Fatalf("failed to wait for serving") + } + } + return servers +} + +// cleanUpContainers cleans up a slice of containers. +func cleanUpContainers(ctx context.Context, containers []*dockerutil.Container) { + for _, c := range containers { + if c != nil { + c.CleanUp(ctx) + } + } +} diff --git a/test/benchmarks/base/startup_test.go b/test/benchmarks/base/startup_test.go new file mode 100644 index 000000000..4628a0a41 --- /dev/null +++ b/test/benchmarks/base/startup_test.go @@ -0,0 +1,156 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package base + +import ( + "context" + "fmt" + "net" + "testing" + "time" + + "gvisor.dev/gvisor/pkg/test/dockerutil" + "gvisor.dev/gvisor/test/benchmarks/harness" +) + +// BenchmarkStartEmpty times startup time for an empty container. +func BenchmarkStartupEmpty(b *testing.B) { + machine, err := testHarness.GetMachine() + if err != nil { + b.Fatalf("failed to get machine: %v", err) + } + defer machine.CleanUp() + + ctx := context.Background() + for i := 0; i < b.N; i++ { + container := machine.GetContainer(ctx, b) + defer container.CleanUp(ctx) + if _, err := container.Run(ctx, dockerutil.RunOpts{ + Image: "benchmarks/alpine", + }, "true"); err != nil { + b.Fatalf("failed to run container: %v", err) + } + } +} + +// BenchmarkStartupNginx times startup for a Nginx instance. +// Time is measured from start until the first request is served. +func BenchmarkStartupNginx(b *testing.B) { + // The machine to hold Nginx and the Node Server. + machine, err := testHarness.GetMachine() + if err != nil { + b.Fatalf("failed to get machine with: %v", err) + } + defer machine.CleanUp() + + ctx := context.Background() + runOpts := dockerutil.RunOpts{ + Image: "benchmarks/nginx", + } + runServerWorkload(ctx, b, + serverArgs{ + machine: machine, + runOpts: runOpts, + port: 80, + }) +} + +// BenchmarkStartupNode times startup for a Node application instance. +// Time is measured from start until the first request is served. +// Note that the Node app connects to a Redis instance before serving. +func BenchmarkStartupNode(b *testing.B) { + machine, err := testHarness.GetMachine() + if err != nil { + b.Fatalf("failed to get machine with: %v", err) + } + defer machine.CleanUp() + + ctx := context.Background() + redis, redisIP := redisInstance(ctx, b, machine) + defer redis.CleanUp(ctx) + runOpts := dockerutil.RunOpts{ + Image: "benchmarks/node", + WorkDir: "/usr/src/app", + Links: []string{redis.MakeLink("redis")}, + } + + cmd := []string{"node", "index.js", redisIP.String()} + runServerWorkload(ctx, b, + serverArgs{ + machine: machine, + port: 8080, + runOpts: runOpts, + cmd: cmd, + }) +} + +// redisInstance returns a Redis container and its reachable IP. +func redisInstance(ctx context.Context, b *testing.B, machine harness.Machine) (*dockerutil.Container, net.IP) { + b.Helper() + // Spawn a redis instance for the app to use. + redis := machine.GetNativeContainer(ctx, b) + if err := redis.Spawn(ctx, dockerutil.RunOpts{ + Image: "benchmarks/redis", + }); err != nil { + redis.CleanUp(ctx) + b.Fatalf("failed to spwan redis instance: %v", err) + } + + if out, err := redis.WaitForOutput(ctx, "Ready to accept connections", 3*time.Second); err != nil { + redis.CleanUp(ctx) + b.Fatalf("failed to start redis server: %v %s", err, out) + } + redisIP, err := redis.FindIP(ctx, false) + if err != nil { + redis.CleanUp(ctx) + b.Fatalf("failed to get IP from redis instance: %v", err) + } + return redis, redisIP +} + +// runServerWorkload runs a server workload defined by 'runOpts' and 'cmd'. +// 'clientMachine' is used to connect to the server on 'serverMachine'. +func runServerWorkload(ctx context.Context, b *testing.B, args serverArgs) { + b.Helper() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + if err := func() error { + server := args.machine.GetContainer(ctx, b) + defer func() { + b.StopTimer() + // Cleanup servers as we run so that we can go indefinitely. + server.CleanUp(ctx) + b.StartTimer() + }() + if err := server.Spawn(ctx, args.runOpts, args.cmd...); err != nil { + return fmt.Errorf("failed to spawn node instance: %v", err) + } + + servingIP, err := server.FindIP(ctx, false) + if err != nil { + return fmt.Errorf("failed to get ip from server: %v", err) + } + + // Wait until the Client sees the server as up. + if err := harness.WaitUntilServing(ctx, args.machine, servingIP, args.port); err != nil { + return fmt.Errorf("failed to wait for serving: %v", err) + } + return nil + }(); err != nil { + b.Fatal(err) + } + } +} diff --git a/test/benchmarks/base/sysbench_test.go b/test/benchmarks/base/sysbench_test.go new file mode 100644 index 000000000..6fb813640 --- /dev/null +++ b/test/benchmarks/base/sysbench_test.go @@ -0,0 +1,89 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package base + +import ( + "context" + "testing" + + "gvisor.dev/gvisor/pkg/test/dockerutil" + "gvisor.dev/gvisor/test/benchmarks/tools" +) + +type testCase struct { + name string + test tools.Sysbench +} + +// BenchmarSysbench runs sysbench on the runtime. +func BenchmarkSysbench(b *testing.B) { + + testCases := []testCase{ + testCase{ + name: "CPU", + test: &tools.SysbenchCPU{ + Base: tools.SysbenchBase{ + Threads: 1, + Time: 5, + }, + MaxPrime: 50000, + }, + }, + testCase{ + name: "Memory", + test: &tools.SysbenchMemory{ + Base: tools.SysbenchBase{ + Threads: 1, + }, + BlockSize: "1M", + TotalSize: "500G", + }, + }, + testCase{ + name: "Mutex", + test: &tools.SysbenchMutex{ + Base: tools.SysbenchBase{ + Threads: 8, + }, + Loops: 1, + Locks: 10000000, + Num: 4, + }, + }, + } + + machine, err := testHarness.GetMachine() + if err != nil { + b.Fatalf("failed to get machine: %v", err) + } + defer machine.CleanUp() + + for _, tc := range testCases { + b.Run(tc.name, func(b *testing.B) { + + ctx := context.Background() + sysbench := machine.GetContainer(ctx, b) + defer sysbench.CleanUp(ctx) + + out, err := sysbench.Run(ctx, dockerutil.RunOpts{ + Image: "benchmarks/sysbench", + }, tc.test.MakeCmd()...) + if err != nil { + b.Fatalf("failed to run sysbench: %v: logs:%s", err, out) + } + tc.test.Report(b, out) + }) + } +} diff --git a/test/benchmarks/database/BUILD b/test/benchmarks/database/BUILD index 5e33465cd..6139f6e8a 100644 --- a/test/benchmarks/database/BUILD +++ b/test/benchmarks/database/BUILD @@ -12,9 +12,7 @@ go_library( go_test( name = "database_test", size = "enormous", - srcs = [ - "redis_test.go", - ], + srcs = ["redis_test.go"], library = ":database", tags = [ # Requires docker and runsc to be configured before test runs. @@ -24,5 +22,6 @@ go_test( deps = [ "//pkg/test/dockerutil", "//test/benchmarks/harness", + "//test/benchmarks/tools", ], ) diff --git a/test/benchmarks/database/redis_test.go b/test/benchmarks/database/redis_test.go index 6d39f4d66..394fce820 100644 --- a/test/benchmarks/database/redis_test.go +++ b/test/benchmarks/database/redis_test.go @@ -16,15 +16,12 @@ package database import ( "context" - "fmt" - "regexp" - "strconv" - "strings" "testing" "time" "gvisor.dev/gvisor/pkg/test/dockerutil" "gvisor.dev/gvisor/test/benchmarks/harness" + "gvisor.dev/gvisor/test/benchmarks/tools" ) // All possible operations from redis. Note: "ping" will @@ -99,16 +96,10 @@ func BenchmarkRedis(b *testing.B) { b.Fatalf("failed to start redis with: %v", err) } - // runs redis benchmark -t operation for 100K requests against server. - cmd := strings.Split( - fmt.Sprintf("redis-benchmark --csv -t %s -h %s -p %d", operation, ip, serverPort), " ") - - // There is no -t PING_BULK for redis-benchmark, so adjust the command in that case. - // Note that "ping" will run both PING_INLINE and PING_BULK. - if operation == "PING_BULK" { - cmd = strings.Split( - fmt.Sprintf("redis-benchmark --csv -t ping -h %s -p %d", ip, serverPort), " ") + redis := tools.Redis{ + Operation: operation, } + // Reset profiles and timer to begin the measurement. server.RestartProfiles() b.ResetTimer() @@ -117,81 +108,16 @@ func BenchmarkRedis(b *testing.B) { defer client.CleanUp(ctx) out, err := client.Run(ctx, dockerutil.RunOpts{ Image: "benchmarks/redis", - }, cmd...) + }, redis.MakeCmd(ip, serverPort)...) if err != nil { b.Fatalf("redis-benchmark failed with: %v", err) } // Stop time while we parse results. b.StopTimer() - result, err := parseOperation(operation, out) - if err != nil { - b.Fatalf("parsing result %s failed with err: %v", out, err) - } - b.ReportMetric(result, operation) // operations per second + redis.Report(b, out) b.StartTimer() } }) } } - -// parseOperation grabs the metric operations per second from redis-benchmark output. -func parseOperation(operation, data string) (float64, error) { - re := regexp.MustCompile(fmt.Sprintf(`"%s( .*)?","(\d*\.\d*)"`, operation)) - match := re.FindStringSubmatch(data) - // If no match, simply don't add it to the result map. - if len(match) < 3 { - return 0.0, fmt.Errorf("could not find %s in %s", operation, data) - } - return strconv.ParseFloat(match[2], 64) -} - -// TestParser tests the parser on sample data. -func TestParser(t *testing.T) { - sampleData := ` - "PING_INLINE","48661.80" - "PING_BULK","50301.81" - "SET","48923.68" - "GET","49382.71" - "INCR","49975.02" - "LPUSH","49875.31" - "RPUSH","50276.52" - "LPOP","50327.12" - "RPOP","50556.12" - "SADD","49504.95" - "HSET","49504.95" - "SPOP","50025.02" - "LPUSH (needed to benchmark LRANGE)","48875.86" - "LRANGE_100 (first 100 elements)","33955.86" - "LRANGE_300 (first 300 elements)","16550.81" - "LRANGE_500 (first 450 elements)","13653.74" - "LRANGE_600 (first 600 elements)","11219.57" - "MSET (10 keys)","44682.75" - ` - wants := map[string]float64{ - "PING_INLINE": 48661.80, - "PING_BULK": 50301.81, - "SET": 48923.68, - "GET": 49382.71, - "INCR": 49975.02, - "LPUSH": 49875.31, - "RPUSH": 50276.52, - "LPOP": 50327.12, - "RPOP": 50556.12, - "SADD": 49504.95, - "HSET": 49504.95, - "SPOP": 50025.02, - "LRANGE_100": 33955.86, - "LRANGE_300": 16550.81, - "LRANGE_500": 13653.74, - "LRANGE_600": 11219.57, - "MSET": 44682.75, - } - for op, want := range wants { - if got, err := parseOperation(op, sampleData); err != nil { - t.Fatalf("failed to parse %s: %v", op, err) - } else if want != got { - t.Fatalf("wanted %f for op %s, got %f", want, op, got) - } - } -} diff --git a/test/benchmarks/fs/BUILD b/test/benchmarks/fs/BUILD index 79327b57c..20654d88f 100644 --- a/test/benchmarks/fs/BUILD +++ b/test/benchmarks/fs/BUILD @@ -25,6 +25,7 @@ go_test( deps = [ "//pkg/test/dockerutil", "//test/benchmarks/harness", + "//test/benchmarks/tools", "@com_github_docker_docker//api/types/mount:go_default_library", ], ) diff --git a/test/benchmarks/fs/bazel_test.go b/test/benchmarks/fs/bazel_test.go index 3854aa87c..f4236ba37 100644 --- a/test/benchmarks/fs/bazel_test.go +++ b/test/benchmarks/fs/bazel_test.go @@ -24,7 +24,18 @@ import ( ) // Note: CleanCache versions of this test require running with root permissions. -func BenchmarkABSL(b *testing.B) { +func BenchmarkBuildABSL(b *testing.B) { + runBuildBenchmark(b, "benchmarks/absl", "/abseil-cpp", "absl/base/...") +} + +// Note: CleanCache versions of this test require running with root permissions. +// Note: This test takes on the order of 10m per permutation for runsc on kvm. +func BenchmarkBuildRunsc(b *testing.B) { + runBuildBenchmark(b, "benchmarks/runsc", "/gvisor", "runsc:runsc") +} + +func runBuildBenchmark(b *testing.B, image, workdir, target string) { + b.Helper() // Get a machine from the Harness on which to run. machine, err := h.GetMachine() if err != nil { @@ -51,20 +62,18 @@ func BenchmarkABSL(b *testing.B) { container := machine.GetContainer(ctx, b) defer container.CleanUp(ctx) - workdir := "/abseil-cpp" - // Start a container and sleep by an order of b.N. if err := container.Spawn(ctx, dockerutil.RunOpts{ - Image: "benchmarks/absl", + Image: image, }, "sleep", fmt.Sprintf("%d", 1000000)); err != nil { b.Fatalf("run failed with: %v", err) } // If we are running on a tmpfs, copy to /tmp which is a tmpfs. if bm.tmpfs { - if _, err := container.Exec(ctx, dockerutil.ExecOpts{}, - "cp", "-r", "/abseil-cpp", "/tmp/."); err != nil { - b.Fatal("failed to copy directory: %v", err) + if out, err := container.Exec(ctx, dockerutil.ExecOpts{}, + "cp", "-r", workdir, "/tmp/."); err != nil { + b.Fatal("failed to copy directory: %v %s", err, out) } workdir = "/tmp" + workdir } @@ -86,7 +95,7 @@ func BenchmarkABSL(b *testing.B) { got, err := container.Exec(ctx, dockerutil.ExecOpts{ WorkDir: workdir, - }, "bazel", "build", "-c", "opt", "absl/base/...") + }, "bazel", "build", "-c", "opt", target) if err != nil { b.Fatalf("build failed with: %v", err) } diff --git a/test/benchmarks/fs/fio_test.go b/test/benchmarks/fs/fio_test.go index 75d52726a..65874ed8b 100644 --- a/test/benchmarks/fs/fio_test.go +++ b/test/benchmarks/fs/fio_test.go @@ -15,72 +15,47 @@ package fs import ( "context" - "encoding/json" "fmt" "path/filepath" - "strconv" "strings" "testing" "github.com/docker/docker/api/types/mount" "gvisor.dev/gvisor/pkg/test/dockerutil" "gvisor.dev/gvisor/test/benchmarks/harness" + "gvisor.dev/gvisor/test/benchmarks/tools" ) -type fioTestCase struct { - test string // test to run: read, write, randread, randwrite. - size string // total size to be read/written of format N[GMK] (e.g. 5G). - blocksize string // blocksize to be read/write of format N[GMK] (e.g. 4K). - iodepth int // iodepth for reads/writes. - time int // time to run the test in seconds, usually for rand(read/write). -} - -// makeCmdFromTestcase makes a fio command. -func (f *fioTestCase) makeCmdFromTestcase(filename string) []string { - cmd := []string{"fio", "--output-format=json", "--ioengine=sync"} - cmd = append(cmd, fmt.Sprintf("--name=%s", f.test)) - cmd = append(cmd, fmt.Sprintf("--size=%s", f.size)) - cmd = append(cmd, fmt.Sprintf("--blocksize=%s", f.blocksize)) - cmd = append(cmd, fmt.Sprintf("--filename=%s", filename)) - cmd = append(cmd, fmt.Sprintf("--iodepth=%d", f.iodepth)) - cmd = append(cmd, fmt.Sprintf("--rw=%s", f.test)) - if f.time != 0 { - cmd = append(cmd, "--time_based") - cmd = append(cmd, fmt.Sprintf("--runtime=%d", f.time)) - } - return cmd -} - // BenchmarkFio runs fio on the runtime under test. There are 4 basic test // cases each run on a tmpfs mount and a bind mount. Fio requires root so that // caches can be dropped. func BenchmarkFio(b *testing.B) { - testCases := []fioTestCase{ - fioTestCase{ - test: "write", - size: "5G", - blocksize: "1M", - iodepth: 4, + testCases := []tools.Fio{ + tools.Fio{ + Test: "write", + Size: "5G", + Blocksize: "1M", + Iodepth: 4, }, - fioTestCase{ - test: "read", - size: "5G", - blocksize: "1M", - iodepth: 4, + tools.Fio{ + Test: "read", + Size: "5G", + Blocksize: "1M", + Iodepth: 4, }, - fioTestCase{ - test: "randwrite", - size: "5G", - blocksize: "4K", - iodepth: 4, - time: 30, + tools.Fio{ + Test: "randwrite", + Size: "5G", + Blocksize: "4K", + Iodepth: 4, + Time: 30, }, - fioTestCase{ - test: "randread", - size: "5G", - blocksize: "4K", - iodepth: 4, - time: 30, + tools.Fio{ + Test: "randread", + Size: "5G", + Blocksize: "4K", + Iodepth: 4, + Time: 30, }, } @@ -92,7 +67,7 @@ func BenchmarkFio(b *testing.B) { for _, fsType := range []mount.Type{mount.TypeBind, mount.TypeTmpfs} { for _, tc := range testCases { - testName := strings.Title(tc.test) + strings.Title(string(fsType)) + testName := strings.Title(tc.Test) + strings.Title(string(fsType)) b.Run(testName, func(b *testing.B) { ctx := context.Background() container := machine.GetContainer(ctx, b) @@ -109,7 +84,6 @@ func BenchmarkFio(b *testing.B) { b.Fatalf("failed to make mount: %v", err) } defer mountCleanup() - cmd := tc.makeCmdFromTestcase(outfile) // Start the container with the mount. if err := container.Spawn( @@ -127,8 +101,8 @@ func BenchmarkFio(b *testing.B) { } // For reads, we need a file to read so make one inside the container. - if strings.Contains(tc.test, "read") { - fallocateCmd := fmt.Sprintf("fallocate -l %s %s", tc.size, outfile) + if strings.Contains(tc.Test, "read") { + fallocateCmd := fmt.Sprintf("fallocate -l %s %s", tc.Size, outfile) if out, err := container.Exec(ctx, dockerutil.ExecOpts{}, strings.Split(fallocateCmd, " ")...); err != nil { b.Fatalf("failed to create readable file on mount: %v, %s", err, out) @@ -139,6 +113,7 @@ func BenchmarkFio(b *testing.B) { if err := harness.DropCaches(machine); err != nil { b.Skipf("failed to drop caches with %v. You probably need root.", err) } + cmd := tc.MakeCmd(outfile) container.RestartProfiles() b.ResetTimer() for i := 0; i < b.N; i++ { @@ -148,19 +123,7 @@ func BenchmarkFio(b *testing.B) { b.Fatalf("failed to run cmd %v: %v", cmd, err) } b.StopTimer() - // Parse the output and report the metrics. - isRead := strings.Contains(tc.test, "read") - bw, err := parseBandwidth(data, isRead) - if err != nil { - b.Fatalf("failed to parse bandwidth from %s with: %v", data, err) - } - b.ReportMetric(bw, "bandwidth") // in b/s. - - iops, err := parseIOps(data, isRead) - if err != nil { - b.Fatalf("failed to parse iops from %s with: %v", data, err) - } - b.ReportMetric(iops, "iops") + tc.Report(b, data) // If b.N is used (i.e. we run for an hour), we should drop caches // after each run. if err := harness.DropCaches(machine); err != nil { @@ -205,165 +168,3 @@ func makeMount(machine harness.Machine, mountType mount.Type, target string) (mo return mount.Mount{}, func() {}, fmt.Errorf("illegal mount time not supported: %v", mountType) } } - -// parseBandwidth reports the bandwidth in b/s. -func parseBandwidth(data string, isRead bool) (float64, error) { - if isRead { - result, err := parseFioJSON(data, "read", "bw") - if err != nil { - return 0, err - } - return 1024 * result, nil - } - result, err := parseFioJSON(data, "write", "bw") - if err != nil { - return 0, err - } - return 1024 * result, nil -} - -// parseIOps reports the write IO per second metric. -func parseIOps(data string, isRead bool) (float64, error) { - if isRead { - return parseFioJSON(data, "read", "iops") - } - return parseFioJSON(data, "write", "iops") -} - -// fioResult is for parsing FioJSON. -type fioResult struct { - Jobs []fioJob -} - -// fioJob is for parsing FioJSON. -type fioJob map[string]json.RawMessage - -// fioMetrics is for parsing FioJSON. -type fioMetrics map[string]json.RawMessage - -// parseFioJSON parses data and grabs "op" (read or write) and "metric" -// (bw or iops) from the JSON. -func parseFioJSON(data, op, metric string) (float64, error) { - var result fioResult - if err := json.Unmarshal([]byte(data), &result); err != nil { - return 0, fmt.Errorf("could not unmarshal data: %v", err) - } - - if len(result.Jobs) < 1 { - return 0, fmt.Errorf("no jobs present to parse") - } - - var metrics fioMetrics - if err := json.Unmarshal(result.Jobs[0][op], &metrics); err != nil { - return 0, fmt.Errorf("could not unmarshal jobs: %v", err) - } - - if _, ok := metrics[metric]; !ok { - return 0, fmt.Errorf("no metric found for op: %s", op) - } - return strconv.ParseFloat(string(metrics[metric]), 64) -} - -// TestParsers tests that the parsers work on sampleData. -func TestParsers(t *testing.T) { - sampleData := ` -{ - "fio version" : "fio-3.1", - "timestamp" : 1554837456, - "timestamp_ms" : 1554837456621, - "time" : "Tue Apr 9 19:17:36 2019", - "jobs" : [ - { - "jobname" : "test", - "groupid" : 0, - "error" : 0, - "eta" : 2147483647, - "elapsed" : 1, - "job options" : { - "name" : "test", - "ioengine" : "sync", - "size" : "1073741824", - "filename" : "/disk/file.dat", - "iodepth" : "4", - "bs" : "4096", - "rw" : "write" - }, - "read" : { - "io_bytes" : 0, - "io_kbytes" : 0, - "bw" : 123456, - "iops" : 1234.5678, - "runtime" : 0, - "total_ios" : 0, - "short_ios" : 0, - "bw_min" : 0, - "bw_max" : 0, - "bw_agg" : 0.000000, - "bw_mean" : 0.000000, - "bw_dev" : 0.000000, - "bw_samples" : 0, - "iops_min" : 0, - "iops_max" : 0, - "iops_mean" : 0.000000, - "iops_stddev" : 0.000000, - "iops_samples" : 0 - }, - "write" : { - "io_bytes" : 1073741824, - "io_kbytes" : 1048576, - "bw" : 1753471, - "iops" : 438367.892977, - "runtime" : 598, - "total_ios" : 262144, - "bw_min" : 1731120, - "bw_max" : 1731120, - "bw_agg" : 98.725328, - "bw_mean" : 1731120.000000, - "bw_dev" : 0.000000, - "bw_samples" : 1, - "iops_min" : 432780, - "iops_max" : 432780, - "iops_mean" : 432780.000000, - "iops_stddev" : 0.000000, - "iops_samples" : 1 - } - } - ] -} -` - // WriteBandwidth. - got, err := parseBandwidth(sampleData, false) - var want float64 = 1753471.0 * 1024 - if err != nil { - t.Fatalf("parse failed with err: %v", err) - } else if got != want { - t.Fatalf("got: %f, want: %f", got, want) - } - - // ReadBandwidth. - got, err = parseBandwidth(sampleData, true) - want = 123456 * 1024 - if err != nil { - t.Fatalf("parse failed with err: %v", err) - } else if got != want { - t.Fatalf("got: %f, want: %f", got, want) - } - - // WriteIOps. - got, err = parseIOps(sampleData, false) - want = 438367.892977 - if err != nil { - t.Fatalf("parse failed with err: %v", err) - } else if got != want { - t.Fatalf("got: %f, want: %f", got, want) - } - - // ReadIOps. - got, err = parseIOps(sampleData, true) - want = 1234.5678 - if err != nil { - t.Fatalf("parse failed with err: %v", err) - } else if got != want { - t.Fatalf("got: %f, want: %f", got, want) - } -} diff --git a/test/benchmarks/harness/util.go b/test/benchmarks/harness/util.go index bc551c582..86b863f78 100644 --- a/test/benchmarks/harness/util.go +++ b/test/benchmarks/harness/util.go @@ -23,23 +23,25 @@ import ( "gvisor.dev/gvisor/pkg/test/testutil" ) +//TODO(gvisor.dev/issue/3535): move to own package or move methods to harness struct. + // WaitUntilServing grabs a container from `machine` and waits for a server at // IP:port. func WaitUntilServing(ctx context.Context, machine Machine, server net.IP, port int) error { - var logger testutil.DefaultLogger = "netcat" + var logger testutil.DefaultLogger = "util" netcat := machine.GetNativeContainer(ctx, logger) defer netcat.CleanUp(ctx) - cmd := fmt.Sprintf("while ! nc -zv %s %d; do true; done", server, port) + cmd := fmt.Sprintf("while ! wget -q --spider http://%s:%d; do true; done", server, port) _, err := netcat.Run(ctx, dockerutil.RunOpts{ - Image: "packetdrill", + Image: "benchmarks/util", }, "sh", "-c", cmd) return err } // DropCaches drops caches on the provided machine. Requires root. func DropCaches(machine Machine) error { - if out, err := machine.RunCommand("/bin/sh", "-c", "sync | sysctl vm.drop_caches=3"); err != nil { + if out, err := machine.RunCommand("/bin/sh", "-c", "sync && sysctl vm.drop_caches=3"); err != nil { return fmt.Errorf("failed to drop caches: %v logs: %s", err, out) } return nil diff --git a/test/benchmarks/network/BUILD b/test/benchmarks/network/BUILD index b47400590..df5ff7265 100644 --- a/test/benchmarks/network/BUILD +++ b/test/benchmarks/network/BUILD @@ -15,7 +15,9 @@ go_test( srcs = [ "httpd_test.go", "iperf_test.go", + "nginx_test.go", "node_test.go", + "ruby_test.go", ], library = ":network", tags = [ @@ -27,5 +29,6 @@ go_test( "//pkg/test/dockerutil", "//pkg/test/testutil", "//test/benchmarks/harness", + "//test/benchmarks/tools", ], ) diff --git a/test/benchmarks/network/httpd_test.go b/test/benchmarks/network/httpd_test.go index fe23ca949..336e04c91 100644 --- a/test/benchmarks/network/httpd_test.go +++ b/test/benchmarks/network/httpd_test.go @@ -16,12 +16,11 @@ package network import ( "context" "fmt" - "regexp" - "strconv" "testing" "gvisor.dev/gvisor/pkg/test/dockerutil" "gvisor.dev/gvisor/test/benchmarks/harness" + "gvisor.dev/gvisor/test/benchmarks/tools" ) // see Dockerfile '//images/benchmarks/httpd'. @@ -52,20 +51,35 @@ func BenchmarkHttpdConcurrency(b *testing.B) { defer serverMachine.CleanUp() // The test iterates over client concurrency, so set other parameters. - requests := 10000 - concurrency := []int{1, 5, 10, 25} - doc := docs["10Kb"] + concurrency := []int{1, 25, 50, 100, 1000} for _, c := range concurrency { b.Run(fmt.Sprintf("%d", c), func(b *testing.B) { - runHttpd(b, clientMachine, serverMachine, doc, requests, c) + hey := &tools.Hey{ + Requests: 10000, + Concurrency: c, + Doc: docs["10Kb"], + } + runHttpd(b, clientMachine, serverMachine, hey, false /* reverse */) }) } } // BenchmarkHttpdDocSize iterates over different sized payloads, testing how -// well the runtime handles different payload sizes. +// well the runtime handles sending different payload sizes. func BenchmarkHttpdDocSize(b *testing.B) { + benchmarkHttpdDocSize(b, false /* reverse */) +} + +// BenchmarkReverseHttpdDocSize iterates over different sized payloads, testing +// how well the runtime handles receiving different payload sizes. +func BenchmarkReverseHttpdDocSize(b *testing.B) { + benchmarkHttpdDocSize(b, true /* reverse */) +} + +func benchmarkHttpdDocSize(b *testing.B, reverse bool) { + b.Helper() + clientMachine, err := h.GetMachine() if err != nil { b.Fatalf("failed to get machine: %v", err) @@ -78,31 +92,42 @@ func BenchmarkHttpdDocSize(b *testing.B) { } defer serverMachine.CleanUp() - requests := 10000 - concurrency := 1 - for name, filename := range docs { - b.Run(name, func(b *testing.B) { - runHttpd(b, clientMachine, serverMachine, filename, requests, concurrency) - }) + concurrency := []int{1, 25, 50, 100, 1000} + for _, c := range concurrency { + b.Run(fmt.Sprintf("%s_%d", name, c), func(b *testing.B) { + hey := &tools.Hey{ + Requests: 10000, + Concurrency: c, + Doc: filename, + } + runHttpd(b, clientMachine, serverMachine, hey, reverse) + }) + } } } // runHttpd runs a single test run. -func runHttpd(b *testing.B, clientMachine, serverMachine harness.Machine, doc string, requests, concurrency int) { +func runHttpd(b *testing.B, clientMachine, serverMachine harness.Machine, hey *tools.Hey, reverse bool) { b.Helper() // Grab a container from the server. ctx := context.Background() - server := serverMachine.GetContainer(ctx, b) + var server *dockerutil.Container + if reverse { + server = serverMachine.GetNativeContainer(ctx, b) + } else { + server = serverMachine.GetContainer(ctx, b) + } + defer server.CleanUp(ctx) // Copy the docs to /tmp and serve from there. - cmd := "mkdir -p /tmp/html; cp -r /local /tmp/html/.; apache2 -X" + cmd := "mkdir -p /tmp/html; cp -r /local/* /tmp/html/.; apache2 -X" port := 80 // Start the server. - server.Spawn(ctx, dockerutil.RunOpts{ + if err := server.Spawn(ctx, dockerutil.RunOpts{ Image: "benchmarks/httpd", Ports: []int{port}, Env: []string{ @@ -113,7 +138,9 @@ func runHttpd(b *testing.B, clientMachine, serverMachine harness.Machine, doc st "APACHE_LOG_DIR=/tmp", "APACHE_PID_FILE=/tmp/apache.pid", }, - }, "sh", "-c", cmd) + }, "sh", "-c", cmd); err != nil { + b.Fatalf("failed to start server: %v", err) + } ip, err := serverMachine.IPAddress() if err != nil { @@ -128,150 +155,27 @@ func runHttpd(b *testing.B, clientMachine, serverMachine harness.Machine, doc st // Check the server is serving. harness.WaitUntilServing(ctx, clientMachine, ip, servingPort) + var client *dockerutil.Container // Grab a client. - client := clientMachine.GetNativeContainer(ctx, b) + if reverse { + client = clientMachine.GetContainer(ctx, b) + } else { + client = clientMachine.GetNativeContainer(ctx, b) + } defer client.CleanUp(ctx) - path := fmt.Sprintf("http://%s:%d/%s", ip, servingPort, doc) - // See apachebench (ab) for flags. - cmd = fmt.Sprintf("ab -n %d -c %d %s", requests, concurrency, path) - b.ResetTimer() server.RestartProfiles() for i := 0; i < b.N; i++ { out, err := client.Run(ctx, dockerutil.RunOpts{ - Image: "benchmarks/ab", - }, "sh", "-c", cmd) + Image: "benchmarks/hey", + }, hey.MakeCmd(ip, servingPort)...) if err != nil { b.Fatalf("run failed with: %v", err) } b.StopTimer() - - // Parse and report custom metrics. - transferRate, err := parseTransferRate(out) - if err != nil { - b.Logf("failed to parse transferrate: %v", err) - } - b.ReportMetric(transferRate*1024, "transfer_rate") // Convert from Kb/s to b/s. - - latency, err := parseLatency(out) - if err != nil { - b.Logf("failed to parse latency: %v", err) - } - b.ReportMetric(latency/1000, "mean_latency") // Convert from ms to s. - - reqPerSecond, err := parseRequestsPerSecond(out) - if err != nil { - b.Logf("failed to parse requests per second: %v", err) - } - b.ReportMetric(reqPerSecond, "requests_per_second") - + hey.Report(b, out) b.StartTimer() } } - -var transferRateRE = regexp.MustCompile(`Transfer rate:\s+(\d+\.?\d+?)\s+\[Kbytes/sec\]\s+received`) - -// parseTransferRate parses transfer rate from apachebench output. -func parseTransferRate(data string) (float64, error) { - match := transferRateRE.FindStringSubmatch(data) - if len(match) < 2 { - return 0, fmt.Errorf("failed get bandwidth: %s", data) - } - return strconv.ParseFloat(match[1], 64) -} - -var latencyRE = regexp.MustCompile(`Total:\s+\d+\s+(\d+)\s+(\d+\.?\d+?)\s+\d+\s+\d+\s`) - -// parseLatency parses latency from apachebench output. -func parseLatency(data string) (float64, error) { - match := latencyRE.FindStringSubmatch(data) - if len(match) < 2 { - return 0, fmt.Errorf("failed get bandwidth: %s", data) - } - return strconv.ParseFloat(match[1], 64) -} - -var requestsPerSecondRE = regexp.MustCompile(`Requests per second:\s+(\d+\.?\d+?)\s+`) - -// parseRequestsPerSecond parses requests per second from apachebench output. -func parseRequestsPerSecond(data string) (float64, error) { - match := requestsPerSecondRE.FindStringSubmatch(data) - if len(match) < 2 { - return 0, fmt.Errorf("failed get bandwidth: %s", data) - } - return strconv.ParseFloat(match[1], 64) -} - -// Sample output from apachebench. -const sampleData = `This is ApacheBench, Version 2.3 <$Revision: 1826891 $> -Copyright 1996 Adam Twiss, Zeus Technology Ltd, http://www.zeustech.net/ -Licensed to The Apache Software Foundation, http://www.apache.org/ - -Benchmarking 10.10.10.10 (be patient).....done - - -Server Software: Apache/2.4.38 -Server Hostname: 10.10.10.10 -Server Port: 80 - -Document Path: /latin10k.txt -Document Length: 210 bytes - -Concurrency Level: 1 -Time taken for tests: 0.180 seconds -Complete requests: 100 -Failed requests: 0 -Non-2xx responses: 100 -Total transferred: 38800 bytes -HTML transferred: 21000 bytes -Requests per second: 556.44 [#/sec] (mean) -Time per request: 1.797 [ms] (mean) -Time per request: 1.797 [ms] (mean, across all concurrent requests) -Transfer rate: 210.84 [Kbytes/sec] received - -Connection Times (ms) - min mean[+/-sd] median max -Connect: 0 0 0.2 0 2 -Processing: 1 2 1.0 1 8 -Waiting: 1 1 1.0 1 7 -Total: 1 2 1.2 1 10 - -Percentage of the requests served within a certain time (ms) - 50% 1 - 66% 2 - 75% 2 - 80% 2 - 90% 2 - 95% 3 - 98% 7 - 99% 10 - 100% 10 (longest request)` - -// TestParsers checks the parsers work. -func TestParsers(t *testing.T) { - want := 210.84 - got, err := parseTransferRate(sampleData) - if err != nil { - t.Fatalf("failed to parse transfer rate with error: %v", err) - } else if got != want { - t.Fatalf("parseTransferRate got: %f, want: %f", got, want) - } - - want = 2.0 - got, err = parseLatency(sampleData) - if err != nil { - t.Fatalf("failed to parse transfer rate with error: %v", err) - } else if got != want { - t.Fatalf("parseLatency got: %f, want: %f", got, want) - } - - want = 556.44 - got, err = parseRequestsPerSecond(sampleData) - if err != nil { - t.Fatalf("failed to parse transfer rate with error: %v", err) - } else if got != want { - t.Fatalf("parseRequestsPerSecond got: %f, want: %f", got, want) - } -} diff --git a/test/benchmarks/network/iperf_test.go b/test/benchmarks/network/iperf_test.go index a5e198e14..b8ab7dfb8 100644 --- a/test/benchmarks/network/iperf_test.go +++ b/test/benchmarks/network/iperf_test.go @@ -15,19 +15,18 @@ package network import ( "context" - "fmt" - "regexp" - "strconv" - "strings" "testing" "gvisor.dev/gvisor/pkg/test/dockerutil" "gvisor.dev/gvisor/pkg/test/testutil" "gvisor.dev/gvisor/test/benchmarks/harness" + "gvisor.dev/gvisor/test/benchmarks/tools" ) func BenchmarkIperf(b *testing.B) { - const time = 10 // time in seconds to run the client. + iperf := tools.Iperf{ + Time: 10, // time in seconds to run client. + } clientMachine, err := h.GetMachine() if err != nil { @@ -92,10 +91,6 @@ func BenchmarkIperf(b *testing.B) { if err := harness.WaitUntilServing(ctx, clientMachine, ip, servingPort); err != nil { b.Fatalf("failed to wait for server: %v", err) } - - // iperf report in Kb realtime - cmd := fmt.Sprintf("iperf -f K --realtime --time %d -c %s -p %d", time, ip.String(), servingPort) - // Run the client. b.ResetTimer() @@ -105,46 +100,14 @@ func BenchmarkIperf(b *testing.B) { for i := 0; i < b.N; i++ { out, err := client.Run(ctx, dockerutil.RunOpts{ Image: "benchmarks/iperf", - }, strings.Split(cmd, " ")...) + }, iperf.MakeCmd(ip, servingPort)...) if err != nil { b.Fatalf("failed to run client: %v", err) } b.StopTimer() - - // Parse bandwidth and report it. - bW, err := bandwidth(out) - if err != nil { - b.Fatalf("failed to parse bandwitdth from %s: %v", out, err) - } - b.ReportMetric(bW*1024, "bandwidth") // Convert from Kb/s to b/s. + iperf.Report(b, out) b.StartTimer() } }) } } - -// bandwidth parses the Bandwidth number from an iperf report. A sample is below. -func bandwidth(data string) (float64, error) { - re := regexp.MustCompile(`\[\s*\d+\][^\n]+\s+(\d+\.?\d*)\s+KBytes/sec`) - match := re.FindStringSubmatch(data) - if len(match) < 1 { - return 0, fmt.Errorf("failed get bandwidth: %s", data) - } - return strconv.ParseFloat(match[1], 64) -} - -func TestParser(t *testing.T) { - sampleData := ` ------------------------------------------------------------- -Client connecting to 10.138.15.215, TCP port 32779 -TCP window size: 45.0 KByte (default) ------------------------------------------------------------- -[ 3] local 10.138.15.216 port 32866 connected with 10.138.15.215 port 32779 -[ ID] Interval Transfer Bandwidth -[ 3] 0.0-10.0 sec 459520 KBytes 45900 KBytes/sec -` - bandwidth, err := bandwidth(sampleData) - if err != nil || bandwidth != 45900 { - t.Fatalf("failed with: %v and %f", err, bandwidth) - } -} diff --git a/test/benchmarks/network/nginx_test.go b/test/benchmarks/network/nginx_test.go new file mode 100644 index 000000000..2bf1a3624 --- /dev/null +++ b/test/benchmarks/network/nginx_test.go @@ -0,0 +1,104 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package network + +import ( + "context" + "fmt" + "testing" + + "gvisor.dev/gvisor/pkg/test/dockerutil" + "gvisor.dev/gvisor/test/benchmarks/harness" + "gvisor.dev/gvisor/test/benchmarks/tools" +) + +// BenchmarkNginxConcurrency iterates the concurrency argument and tests +// how well the runtime under test handles requests in parallel. +// TODO(gvisor.dev/issue/3536): Update with different doc sizes like Httpd. +func BenchmarkNginxConcurrency(b *testing.B) { + // Grab a machine for the client and server. + clientMachine, err := h.GetMachine() + if err != nil { + b.Fatalf("failed to get client: %v", err) + } + defer clientMachine.CleanUp() + + serverMachine, err := h.GetMachine() + if err != nil { + b.Fatalf("failed to get server: %v", err) + } + defer serverMachine.CleanUp() + + concurrency := []int{1, 5, 10, 25} + for _, c := range concurrency { + b.Run(fmt.Sprintf("%d", c), func(b *testing.B) { + hey := &tools.Hey{ + Requests: 10000, + Concurrency: c, + } + runNginx(b, clientMachine, serverMachine, hey) + }) + } +} + +// runHttpd runs a single test run. +func runNginx(b *testing.B, clientMachine, serverMachine harness.Machine, hey *tools.Hey) { + b.Helper() + + // Grab a container from the server. + ctx := context.Background() + server := serverMachine.GetContainer(ctx, b) + defer server.CleanUp(ctx) + + port := 80 + // Start the server. + if err := server.Spawn(ctx, + dockerutil.RunOpts{ + Image: "benchmarks/nginx", + Ports: []int{port}, + }); err != nil { + b.Fatalf("server failed to start: %v", err) + } + + ip, err := serverMachine.IPAddress() + if err != nil { + b.Fatalf("failed to find server ip: %v", err) + } + + servingPort, err := server.FindPort(ctx, port) + if err != nil { + b.Fatalf("failed to find server port %d: %v", port, err) + } + + // Check the server is serving. + harness.WaitUntilServing(ctx, clientMachine, ip, servingPort) + + // Grab a client. + client := clientMachine.GetNativeContainer(ctx, b) + defer client.CleanUp(ctx) + + b.ResetTimer() + server.RestartProfiles() + for i := 0; i < b.N; i++ { + out, err := client.Run(ctx, dockerutil.RunOpts{ + Image: "benchmarks/hey", + }, hey.MakeCmd(ip, servingPort)...) + if err != nil { + b.Fatalf("run failed with: %v", err) + } + b.StopTimer() + hey.Report(b, out) + b.StartTimer() + } +} diff --git a/test/benchmarks/network/node_test.go b/test/benchmarks/network/node_test.go index 2556f710f..52eb794c4 100644 --- a/test/benchmarks/network/node_test.go +++ b/test/benchmarks/network/node_test.go @@ -16,33 +16,33 @@ package network import ( "context" "fmt" - "regexp" - "strconv" - "strings" "testing" "time" "gvisor.dev/gvisor/pkg/test/dockerutil" "gvisor.dev/gvisor/test/benchmarks/harness" + "gvisor.dev/gvisor/test/benchmarks/tools" ) -// BenchmarkNode runs 10K requests using 'hey' against a Node server run on +// BenchmarkNode runs requests using 'hey' against a Node server run on // 'runtime'. The server responds to requests by grabbing some data in a // redis instance and returns the data in its reponse. The test loops through // increasing amounts of concurency for requests. func BenchmarkNode(b *testing.B) { - requests := 10000 concurrency := []int{1, 5, 10, 25} - for _, c := range concurrency { b.Run(fmt.Sprintf("Concurrency%d", c), func(b *testing.B) { - runNode(b, requests, c) + hey := &tools.Hey{ + Requests: b.N * c, // Requests b.N requests per thread. + Concurrency: c, + } + runNode(b, hey) }) } } // runNode runs the test for a given # of requests and concurrency. -func runNode(b *testing.B, requests, concurrency int) { +func runNode(b *testing.B, hey *tools.Hey) { b.Helper() // The machine to hold Redis and the Node Server. @@ -73,7 +73,7 @@ func runNode(b *testing.B, requests, concurrency int) { if out, err := redis.WaitForOutput(ctx, "Ready to accept connections", 3*time.Second); err != nil { b.Fatalf("failed to start redis server: %v %s", err, out) } - redisIP, err := redis.FindIP(ctx) + redisIP, err := redis.FindIP(ctx, false) if err != nil { b.Fatalf("failed to get IP from redis instance: %v", err) } @@ -106,156 +106,22 @@ func runNode(b *testing.B, requests, concurrency int) { // Wait until the Client sees the server as up. harness.WaitUntilServing(ctx, clientMachine, servingIP, servingPort) - heyCmd := strings.Split(fmt.Sprintf("hey -n %d -c %d http://%s:%d/", requests, concurrency, servingIP, servingPort), " ") + heyCmd := hey.MakeCmd(servingIP, servingPort) nodeApp.RestartProfiles() b.ResetTimer() - for i := 0; i < b.N; i++ { - // the client should run on Native. - client := clientMachine.GetNativeContainer(ctx, b) - out, err := client.Run(ctx, dockerutil.RunOpts{ - Image: "benchmarks/hey", - }, heyCmd...) - if err != nil { - b.Fatalf("hey container failed: %v logs: %s", err, out) - } - - // Stop the timer to parse the data and report stats. - b.StopTimer() - requests, err := parseHeyRequestsPerSecond(out) - if err != nil { - b.Fatalf("failed to parse requests per second: %v", err) - } - b.ReportMetric(requests, "requests_per_second") - - bw, err := parseHeyBandwidth(out) - if err != nil { - b.Fatalf("failed to parse bandwidth: %v", err) - } - b.ReportMetric(bw, "bandwidth") - - ave, err := parseHeyAverageLatency(out) - if err != nil { - b.Fatalf("failed to parse average latency: %v", err) - } - b.ReportMetric(ave, "average_latency") - b.StartTimer() - } -} - -var heyReqPerSecondRE = regexp.MustCompile(`Requests/sec:\s*(\d+\.?\d+?)\s+`) - -// parseHeyRequestsPerSecond finds requests per second from hey output. -func parseHeyRequestsPerSecond(data string) (float64, error) { - match := heyReqPerSecondRE.FindStringSubmatch(data) - if len(match) < 2 { - return 0, fmt.Errorf("failed get bandwidth: %s", data) - } - return strconv.ParseFloat(match[1], 64) -} - -var heyAverageLatencyRE = regexp.MustCompile(`Average:\s*(\d+\.?\d+?)\s+secs`) - -// parseHeyAverageLatency finds Average Latency in seconds form hey output. -func parseHeyAverageLatency(data string) (float64, error) { - match := heyAverageLatencyRE.FindStringSubmatch(data) - if len(match) < 2 { - return 0, fmt.Errorf("failed get average latency match%d : %s", len(match), data) - } - return strconv.ParseFloat(match[1], 64) -} - -var heySizePerRequestRE = regexp.MustCompile(`Size/request:\s*(\d+\.?\d+?)\s+bytes`) - -// parseHeyBandwidth computes bandwidth from request/sec * bytes/request -// and reports in bytes/second. -func parseHeyBandwidth(data string) (float64, error) { - match := heyReqPerSecondRE.FindStringSubmatch(data) - if len(match) < 2 { - return 0, fmt.Errorf("failed get requests per second: %s", data) - } - reqPerSecond, err := strconv.ParseFloat(match[1], 64) - if err != nil { - return 0, fmt.Errorf("failed to convert %s to float", match[1]) - } - - match = heySizePerRequestRE.FindStringSubmatch(data) - if len(match) < 2 { - return 0, fmt.Errorf("failed get average latency: %s", data) - } - requestSize, err := strconv.ParseFloat(match[1], 64) - return requestSize * reqPerSecond, err -} - -// TestHeyParsers tests that the parsers work with sample output. -func TestHeyParsers(t *testing.T) { - sampleData := ` - Summary: - Total: 2.2391 secs - Slowest: 1.6292 secs - Fastest: 0.0066 secs - Average: 0.5351 secs - Requests/sec: 89.3202 - - Total data: 841200 bytes - Size/request: 4206 bytes - - Response time histogram: - 0.007 [1] | - 0.169 [0] | - 0.331 [149] |â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– - 0.493 [0] | - 0.656 [0] | - 0.818 [0] | - 0.980 [0] | - 1.142 [0] | - 1.305 [0] | - 1.467 [49] |â– â– â– â– â– â– â– â– â– â– â– â– â– - 1.629 [1] | - - - Latency distribution: - 10% in 0.2149 secs - 25% in 0.2449 secs - 50% in 0.2703 secs - 75% in 1.3315 secs - 90% in 1.4045 secs - 95% in 1.4232 secs - 99% in 1.4362 secs - - Details (average, fastest, slowest): - DNS+dialup: 0.0002 secs, 0.0066 secs, 1.6292 secs - DNS-lookup: 0.0000 secs, 0.0000 secs, 0.0000 secs - req write: 0.0000 secs, 0.0000 secs, 0.0012 secs - resp wait: 0.5225 secs, 0.0064 secs, 1.4346 secs - resp read: 0.0122 secs, 0.0001 secs, 0.2006 secs - - Status code distribution: - [200] 200 responses - ` - want := 89.3202 - got, err := parseHeyRequestsPerSecond(sampleData) - if err != nil { - t.Fatalf("failed to parse request per second with: %v", err) - } else if got != want { - t.Fatalf("got: %f, want: %f", got, want) - } - - want = 89.3202 * 4206 - got, err = parseHeyBandwidth(sampleData) - if err != nil { - t.Fatalf("failed to parse bandwidth with: %v", err) - } else if got != want { - t.Fatalf("got: %f, want: %f", got, want) - } - - want = 0.5351 - got, err = parseHeyAverageLatency(sampleData) + // the client should run on Native. + client := clientMachine.GetNativeContainer(ctx, b) + out, err := client.Run(ctx, dockerutil.RunOpts{ + Image: "benchmarks/hey", + }, heyCmd...) if err != nil { - t.Fatalf("failed to parse average latency with: %v", err) - } else if got != want { - t.Fatalf("got: %f, want: %f", got, want) + b.Fatalf("hey container failed: %v logs: %s", err, out) } + // Stop the timer to parse the data and report stats. + b.StopTimer() + hey.Report(b, out) + b.StartTimer() } diff --git a/test/benchmarks/network/ruby_test.go b/test/benchmarks/network/ruby_test.go new file mode 100644 index 000000000..5e0b2b724 --- /dev/null +++ b/test/benchmarks/network/ruby_test.go @@ -0,0 +1,134 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package network + +import ( + "context" + "fmt" + "testing" + "time" + + "gvisor.dev/gvisor/pkg/test/dockerutil" + "gvisor.dev/gvisor/test/benchmarks/harness" + "gvisor.dev/gvisor/test/benchmarks/tools" +) + +// BenchmarkRuby runs requests using 'hey' against a ruby application server. +// On start, ruby app generates some random data and pushes it to a redis +// instance. On a request, the app grabs for random entries from the redis +// server, publishes it to a document, and returns the doc to the request. +func BenchmarkRuby(b *testing.B) { + concurrency := []int{1, 5, 10, 25} + for _, c := range concurrency { + b.Run(fmt.Sprintf("Concurrency%d", c), func(b *testing.B) { + hey := &tools.Hey{ + Requests: b.N * c, // b.N requests per thread. + Concurrency: c, + } + runRuby(b, hey) + }) + } +} + +// runRuby runs the test for a given # of requests and concurrency. +func runRuby(b *testing.B, hey *tools.Hey) { + b.Helper() + // The machine to hold Redis and the Ruby Server. + serverMachine, err := h.GetMachine() + if err != nil { + b.Fatal("failed to get machine with: %v", err) + } + defer serverMachine.CleanUp() + + // The machine to run 'hey'. + clientMachine, err := h.GetMachine() + if err != nil { + b.Fatal("failed to get machine with: %v", err) + } + defer clientMachine.CleanUp() + ctx := context.Background() + + // Spawn a redis instance for the app to use. + redis := serverMachine.GetNativeContainer(ctx, b) + if err := redis.Spawn(ctx, dockerutil.RunOpts{ + Image: "benchmarks/redis", + }); err != nil { + b.Fatalf("failed to spwan redis instance: %v", err) + } + defer redis.CleanUp(ctx) + + if out, err := redis.WaitForOutput(ctx, "Ready to accept connections", 3*time.Second); err != nil { + b.Fatalf("failed to start redis server: %v %s", err, out) + } + redisIP, err := redis.FindIP(ctx, false) + if err != nil { + b.Fatalf("failed to get IP from redis instance: %v", err) + } + + // Ruby runs on port 9292. + const port = 9292 + + // Start-up the Ruby server. + rubyApp := serverMachine.GetContainer(ctx, b) + if err := rubyApp.Spawn(ctx, dockerutil.RunOpts{ + Image: "benchmarks/ruby", + WorkDir: "/app", + Links: []string{redis.MakeLink("redis")}, + Ports: []int{port}, + Env: []string{ + fmt.Sprintf("PORT=%d", port), + "WEB_CONCURRENCY=20", + "WEB_MAX_THREADS=20", + "RACK_ENV=production", + fmt.Sprintf("HOST=%s", redisIP), + }, + User: "nobody", + }, "sh", "-c", "/usr/bin/puma"); err != nil { + b.Fatalf("failed to spawn node instance: %v", err) + } + defer rubyApp.CleanUp(ctx) + + servingIP, err := serverMachine.IPAddress() + if err != nil { + b.Fatalf("failed to get ip from server: %v", err) + } + + servingPort, err := rubyApp.FindPort(ctx, port) + if err != nil { + b.Fatalf("failed to port from node instance: %v", err) + } + + // Wait until the Client sees the server as up. + if err := harness.WaitUntilServing(ctx, clientMachine, servingIP, servingPort); err != nil { + b.Fatalf("failed to wait until serving: %v", err) + } + heyCmd := hey.MakeCmd(servingIP, servingPort) + rubyApp.RestartProfiles() + b.ResetTimer() + + // the client should run on Native. + client := clientMachine.GetNativeContainer(ctx, b) + defer client.CleanUp(ctx) + out, err := client.Run(ctx, dockerutil.RunOpts{ + Image: "benchmarks/hey", + }, heyCmd...) + if err != nil { + b.Fatalf("hey container failed: %v logs: %s", err, out) + } + + // Stop the timer to parse the data and report stats. + b.StopTimer() + hey.Report(b, out) + b.StartTimer() +} diff --git a/benchmarks/tcp/BUILD b/test/benchmarks/tcp/BUILD index 6dde7d9e6..6dde7d9e6 100644 --- a/benchmarks/tcp/BUILD +++ b/test/benchmarks/tcp/BUILD diff --git a/benchmarks/tcp/README.md b/test/benchmarks/tcp/README.md index 38e6e69f0..38e6e69f0 100644 --- a/benchmarks/tcp/README.md +++ b/test/benchmarks/tcp/README.md diff --git a/benchmarks/tcp/nsjoin.c b/test/benchmarks/tcp/nsjoin.c index 524b4d549..524b4d549 100644 --- a/benchmarks/tcp/nsjoin.c +++ b/test/benchmarks/tcp/nsjoin.c diff --git a/benchmarks/tcp/tcp_benchmark.sh b/test/benchmarks/tcp/tcp_benchmark.sh index ef04b4ace..ef04b4ace 100755 --- a/benchmarks/tcp/tcp_benchmark.sh +++ b/test/benchmarks/tcp/tcp_benchmark.sh diff --git a/benchmarks/tcp/tcp_proxy.go b/test/benchmarks/tcp/tcp_proxy.go index 4b7ca7a14..4b7ca7a14 100644 --- a/benchmarks/tcp/tcp_proxy.go +++ b/test/benchmarks/tcp/tcp_proxy.go diff --git a/test/benchmarks/tools/BUILD b/test/benchmarks/tools/BUILD new file mode 100644 index 000000000..e5734d85c --- /dev/null +++ b/test/benchmarks/tools/BUILD @@ -0,0 +1,33 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "tools", + srcs = [ + "ab.go", + "fio.go", + "hey.go", + "iperf.go", + "meminfo.go", + "redis.go", + "sysbench.go", + "tools.go", + ], + visibility = ["//:sandbox"], +) + +go_test( + name = "tools_test", + size = "small", + srcs = [ + "ab_test.go", + "fio_test.go", + "hey_test.go", + "iperf_test.go", + "meminfo_test.go", + "redis_test.go", + "sysbench_test.go", + ], + library = ":tools", +) diff --git a/test/benchmarks/tools/ab.go b/test/benchmarks/tools/ab.go new file mode 100644 index 000000000..4cc9c3bce --- /dev/null +++ b/test/benchmarks/tools/ab.go @@ -0,0 +1,94 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tools + +import ( + "fmt" + "net" + "regexp" + "strconv" + "testing" +) + +// ApacheBench is for the client application ApacheBench. +type ApacheBench struct { + Requests int + Concurrency int + Doc string + // TODO(zkoopmans): support KeepAlive and pass option to enable. +} + +// MakeCmd makes an ApacheBench command. +func (a *ApacheBench) MakeCmd(ip net.IP, port int) []string { + path := fmt.Sprintf("http://%s:%d/%s", ip, port, a.Doc) + // See apachebench (ab) for flags. + cmd := fmt.Sprintf("ab -n %d -c %d %s", a.Requests, a.Concurrency, path) + return []string{"sh", "-c", cmd} +} + +// Report parses and reports metrics from ApacheBench output. +func (a *ApacheBench) Report(b *testing.B, output string) { + // Parse and report custom metrics. + transferRate, err := a.parseTransferRate(output) + if err != nil { + b.Logf("failed to parse transferrate: %v", err) + } + b.ReportMetric(transferRate*1024, "transfer_rate_b/s") // Convert from Kb/s to b/s. + + latency, err := a.parseLatency(output) + if err != nil { + b.Logf("failed to parse latency: %v", err) + } + b.ReportMetric(latency/1000, "mean_latency_secs") // Convert from ms to s. + + reqPerSecond, err := a.parseRequestsPerSecond(output) + if err != nil { + b.Logf("failed to parse requests per second: %v", err) + } + b.ReportMetric(reqPerSecond, "requests_per_second") +} + +var transferRateRE = regexp.MustCompile(`Transfer rate:\s+(\d+\.?\d+?)\s+\[Kbytes/sec\]\s+received`) + +// parseTransferRate parses transfer rate from ApacheBench output. +func (a *ApacheBench) parseTransferRate(data string) (float64, error) { + match := transferRateRE.FindStringSubmatch(data) + if len(match) < 2 { + return 0, fmt.Errorf("failed get bandwidth: %s", data) + } + return strconv.ParseFloat(match[1], 64) +} + +var latencyRE = regexp.MustCompile(`Total:\s+\d+\s+(\d+)\s+(\d+\.?\d+?)\s+\d+\s+\d+\s`) + +// parseLatency parses latency from ApacheBench output. +func (a *ApacheBench) parseLatency(data string) (float64, error) { + match := latencyRE.FindStringSubmatch(data) + if len(match) < 2 { + return 0, fmt.Errorf("failed get bandwidth: %s", data) + } + return strconv.ParseFloat(match[1], 64) +} + +var requestsPerSecondRE = regexp.MustCompile(`Requests per second:\s+(\d+\.?\d+?)\s+`) + +// parseRequestsPerSecond parses requests per second from ApacheBench output. +func (a *ApacheBench) parseRequestsPerSecond(data string) (float64, error) { + match := requestsPerSecondRE.FindStringSubmatch(data) + if len(match) < 2 { + return 0, fmt.Errorf("failed get bandwidth: %s", data) + } + return strconv.ParseFloat(match[1], 64) +} diff --git a/test/benchmarks/tools/ab_test.go b/test/benchmarks/tools/ab_test.go new file mode 100644 index 000000000..28ee66ec1 --- /dev/null +++ b/test/benchmarks/tools/ab_test.go @@ -0,0 +1,90 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tools + +import "testing" + +// TestApacheBench checks the ApacheBench parsers on sample output. +func TestApacheBench(t *testing.T) { + // Sample output from apachebench. + sampleData := `This is ApacheBench, Version 2.3 <$Revision: 1826891 $> +Copyright 1996 Adam Twiss, Zeus Technology Ltd, http://www.zeustech.net/ +Licensed to The Apache Software Foundation, http://www.apache.org/ + +Benchmarking 10.10.10.10 (be patient).....done + + +Server Software: Apache/2.4.38 +Server Hostname: 10.10.10.10 +Server Port: 80 + +Document Path: /latin10k.txt +Document Length: 210 bytes + +Concurrency Level: 1 +Time taken for tests: 0.180 seconds +Complete requests: 100 +Failed requests: 0 +Non-2xx responses: 100 +Total transferred: 38800 bytes +HTML transferred: 21000 bytes +Requests per second: 556.44 [#/sec] (mean) +Time per request: 1.797 [ms] (mean) +Time per request: 1.797 [ms] (mean, across all concurrent requests) +Transfer rate: 210.84 [Kbytes/sec] received + +Connection Times (ms) + min mean[+/-sd] median max +Connect: 0 0 0.2 0 2 +Processing: 1 2 1.0 1 8 +Waiting: 1 1 1.0 1 7 +Total: 1 2 1.2 1 10 + +Percentage of the requests served within a certain time (ms) + 50% 1 + 66% 2 + 75% 2 + 80% 2 + 90% 2 + 95% 3 + 98% 7 + 99% 10 + 100% 10 (longest request)` + + ab := ApacheBench{} + want := 210.84 + got, err := ab.parseTransferRate(sampleData) + if err != nil { + t.Fatalf("failed to parse transfer rate with error: %v", err) + } else if got != want { + t.Fatalf("parseTransferRate got: %f, want: %f", got, want) + } + + want = 2.0 + got, err = ab.parseLatency(sampleData) + if err != nil { + t.Fatalf("failed to parse transfer rate with error: %v", err) + } else if got != want { + t.Fatalf("parseLatency got: %f, want: %f", got, want) + } + + want = 556.44 + got, err = ab.parseRequestsPerSecond(sampleData) + if err != nil { + t.Fatalf("failed to parse transfer rate with error: %v", err) + } else if got != want { + t.Fatalf("parseRequestsPerSecond got: %f, want: %f", got, want) + } +} diff --git a/test/benchmarks/tools/fio.go b/test/benchmarks/tools/fio.go new file mode 100644 index 000000000..20000db16 --- /dev/null +++ b/test/benchmarks/tools/fio.go @@ -0,0 +1,124 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tools + +import ( + "encoding/json" + "fmt" + "strconv" + "strings" + "testing" +) + +// Fio makes 'fio' commands and parses their output. +type Fio struct { + Test string // test to run: read, write, randread, randwrite. + Size string // total size to be read/written of format N[GMK] (e.g. 5G). + Blocksize string // blocksize to be read/write of format N[GMK] (e.g. 4K). + Iodepth int // iodepth for reads/writes. + Time int // time to run the test in seconds, usually for rand(read/write). +} + +// MakeCmd makes a 'fio' command. +func (f *Fio) MakeCmd(filename string) []string { + cmd := []string{"fio", "--output-format=json", "--ioengine=sync"} + cmd = append(cmd, fmt.Sprintf("--name=%s", f.Test)) + cmd = append(cmd, fmt.Sprintf("--size=%s", f.Size)) + cmd = append(cmd, fmt.Sprintf("--blocksize=%s", f.Blocksize)) + cmd = append(cmd, fmt.Sprintf("--filename=%s", filename)) + cmd = append(cmd, fmt.Sprintf("--iodepth=%d", f.Iodepth)) + cmd = append(cmd, fmt.Sprintf("--rw=%s", f.Test)) + if f.Time != 0 { + cmd = append(cmd, "--time_based") + cmd = append(cmd, fmt.Sprintf("--runtime=%d", f.Time)) + } + return cmd +} + +// Report reports metrics based on output from an 'fio' command. +func (f *Fio) Report(b *testing.B, output string) { + b.Helper() + // Parse the output and report the metrics. + isRead := strings.Contains(f.Test, "read") + bw, err := f.parseBandwidth(output, isRead) + if err != nil { + b.Fatalf("failed to parse bandwidth from %s with: %v", output, err) + } + b.ReportMetric(bw, "bandwidth_b/s") // in b/s. + + iops, err := f.parseIOps(output, isRead) + if err != nil { + b.Fatalf("failed to parse iops from %s with: %v", output, err) + } + b.ReportMetric(iops, "iops") +} + +// parseBandwidth reports the bandwidth in b/s. +func (f *Fio) parseBandwidth(data string, isRead bool) (float64, error) { + if isRead { + result, err := f.parseFioJSON(data, "read", "bw") + if err != nil { + return 0, err + } + return 1024 * result, nil + } + result, err := f.parseFioJSON(data, "write", "bw") + if err != nil { + return 0, err + } + return 1024 * result, nil +} + +// parseIOps reports the write IO per second metric. +func (f *Fio) parseIOps(data string, isRead bool) (float64, error) { + if isRead { + return f.parseFioJSON(data, "read", "iops") + } + return f.parseFioJSON(data, "write", "iops") +} + +// fioResult is for parsing FioJSON. +type fioResult struct { + Jobs []fioJob +} + +// fioJob is for parsing FioJSON. +type fioJob map[string]json.RawMessage + +// fioMetrics is for parsing FioJSON. +type fioMetrics map[string]json.RawMessage + +// parseFioJSON parses data and grabs "op" (read or write) and "metric" +// (bw or iops) from the JSON. +func (f *Fio) parseFioJSON(data, op, metric string) (float64, error) { + var result fioResult + if err := json.Unmarshal([]byte(data), &result); err != nil { + return 0, fmt.Errorf("could not unmarshal data: %v", err) + } + + if len(result.Jobs) < 1 { + return 0, fmt.Errorf("no jobs present to parse") + } + + var metrics fioMetrics + if err := json.Unmarshal(result.Jobs[0][op], &metrics); err != nil { + return 0, fmt.Errorf("could not unmarshal jobs: %v", err) + } + + if _, ok := metrics[metric]; !ok { + return 0, fmt.Errorf("no metric found for op: %s", op) + } + return strconv.ParseFloat(string(metrics[metric]), 64) +} diff --git a/test/benchmarks/tools/fio_test.go b/test/benchmarks/tools/fio_test.go new file mode 100644 index 000000000..a98277150 --- /dev/null +++ b/test/benchmarks/tools/fio_test.go @@ -0,0 +1,122 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tools + +import "testing" + +// TestFio checks the Fio parsers on sample output. +func TestFio(t *testing.T) { + sampleData := ` +{ + "fio version" : "fio-3.1", + "timestamp" : 1554837456, + "timestamp_ms" : 1554837456621, + "time" : "Tue Apr 9 19:17:36 2019", + "jobs" : [ + { + "jobname" : "test", + "groupid" : 0, + "error" : 0, + "eta" : 2147483647, + "elapsed" : 1, + "job options" : { + "name" : "test", + "ioengine" : "sync", + "size" : "1073741824", + "filename" : "/disk/file.dat", + "iodepth" : "4", + "bs" : "4096", + "rw" : "write" + }, + "read" : { + "io_bytes" : 0, + "io_kbytes" : 0, + "bw" : 123456, + "iops" : 1234.5678, + "runtime" : 0, + "total_ios" : 0, + "short_ios" : 0, + "bw_min" : 0, + "bw_max" : 0, + "bw_agg" : 0.000000, + "bw_mean" : 0.000000, + "bw_dev" : 0.000000, + "bw_samples" : 0, + "iops_min" : 0, + "iops_max" : 0, + "iops_mean" : 0.000000, + "iops_stddev" : 0.000000, + "iops_samples" : 0 + }, + "write" : { + "io_bytes" : 1073741824, + "io_kbytes" : 1048576, + "bw" : 1753471, + "iops" : 438367.892977, + "runtime" : 598, + "total_ios" : 262144, + "bw_min" : 1731120, + "bw_max" : 1731120, + "bw_agg" : 98.725328, + "bw_mean" : 1731120.000000, + "bw_dev" : 0.000000, + "bw_samples" : 1, + "iops_min" : 432780, + "iops_max" : 432780, + "iops_mean" : 432780.000000, + "iops_stddev" : 0.000000, + "iops_samples" : 1 + } + } + ] +} +` + fio := Fio{} + // WriteBandwidth. + got, err := fio.parseBandwidth(sampleData, false) + var want float64 = 1753471.0 * 1024 + if err != nil { + t.Fatalf("parse failed with err: %v", err) + } else if got != want { + t.Fatalf("got: %f, want: %f", got, want) + } + + // ReadBandwidth. + got, err = fio.parseBandwidth(sampleData, true) + want = 123456 * 1024 + if err != nil { + t.Fatalf("parse failed with err: %v", err) + } else if got != want { + t.Fatalf("got: %f, want: %f", got, want) + } + + // WriteIOps. + got, err = fio.parseIOps(sampleData, false) + want = 438367.892977 + if err != nil { + t.Fatalf("parse failed with err: %v", err) + } else if got != want { + t.Fatalf("got: %f, want: %f", got, want) + } + + // ReadIOps. + got, err = fio.parseIOps(sampleData, true) + want = 1234.5678 + if err != nil { + t.Fatalf("parse failed with err: %v", err) + } else if got != want { + t.Fatalf("got: %f, want: %f", got, want) + } +} diff --git a/test/benchmarks/tools/hey.go b/test/benchmarks/tools/hey.go new file mode 100644 index 000000000..b1e20e356 --- /dev/null +++ b/test/benchmarks/tools/hey.go @@ -0,0 +1,75 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tools + +import ( + "fmt" + "net" + "regexp" + "strconv" + "strings" + "testing" +) + +// Hey is for the client application 'hey'. +type Hey struct { + Requests int // Note: requests cannot be less than concurrency. + Concurrency int + Doc string +} + +// MakeCmd returns a 'hey' command. +func (h *Hey) MakeCmd(ip net.IP, port int) []string { + return strings.Split(fmt.Sprintf("hey -n %d -c %d http://%s:%d/%s", + h.Requests, h.Concurrency, ip, port, h.Doc), " ") +} + +// Report parses output from 'hey' and reports metrics. +func (h *Hey) Report(b *testing.B, output string) { + b.Helper() + requests, err := h.parseRequestsPerSecond(output) + if err != nil { + b.Fatalf("failed to parse requests per second: %v", err) + } + b.ReportMetric(requests, "requests_per_second") + + ave, err := h.parseAverageLatency(output) + if err != nil { + b.Fatalf("failed to parse average latency: %v", err) + } + b.ReportMetric(ave, "average_latency_secs") +} + +var heyReqPerSecondRE = regexp.MustCompile(`Requests/sec:\s*(\d+\.?\d+?)\s+`) + +// parseRequestsPerSecond finds requests per second from 'hey' output. +func (h *Hey) parseRequestsPerSecond(data string) (float64, error) { + match := heyReqPerSecondRE.FindStringSubmatch(data) + if len(match) < 2 { + return 0, fmt.Errorf("failed get bandwidth: %s", data) + } + return strconv.ParseFloat(match[1], 64) +} + +var heyAverageLatencyRE = regexp.MustCompile(`Average:\s*(\d+\.?\d+?)\s+secs`) + +// parseHeyAverageLatency finds Average Latency in seconds form 'hey' output. +func (h *Hey) parseAverageLatency(data string) (float64, error) { + match := heyAverageLatencyRE.FindStringSubmatch(data) + if len(match) < 2 { + return 0, fmt.Errorf("failed get average latency match%d : %s", len(match), data) + } + return strconv.ParseFloat(match[1], 64) +} diff --git a/test/benchmarks/tools/hey_test.go b/test/benchmarks/tools/hey_test.go new file mode 100644 index 000000000..e0cab1f52 --- /dev/null +++ b/test/benchmarks/tools/hey_test.go @@ -0,0 +1,81 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tools + +import "testing" + +// TestHey checks the Hey parsers on sample output. +func TestHey(t *testing.T) { + sampleData := ` + Summary: + Total: 2.2391 secs + Slowest: 1.6292 secs + Fastest: 0.0066 secs + Average: 0.5351 secs + Requests/sec: 89.3202 + + Total data: 841200 bytes + Size/request: 4206 bytes + + Response time histogram: + 0.007 [1] | + 0.169 [0] | + 0.331 [149] |â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– + 0.493 [0] | + 0.656 [0] | + 0.818 [0] | + 0.980 [0] | + 1.142 [0] | + 1.305 [0] | + 1.467 [49] |â– â– â– â– â– â– â– â– â– â– â– â– â– + 1.629 [1] | + + + Latency distribution: + 10% in 0.2149 secs + 25% in 0.2449 secs + 50% in 0.2703 secs + 75% in 1.3315 secs + 90% in 1.4045 secs + 95% in 1.4232 secs + 99% in 1.4362 secs + + Details (average, fastest, slowest): + DNS+dialup: 0.0002 secs, 0.0066 secs, 1.6292 secs + DNS-lookup: 0.0000 secs, 0.0000 secs, 0.0000 secs + req write: 0.0000 secs, 0.0000 secs, 0.0012 secs + resp wait: 0.5225 secs, 0.0064 secs, 1.4346 secs + resp read: 0.0122 secs, 0.0001 secs, 0.2006 secs + + Status code distribution: + [200] 200 responses + ` + hey := Hey{} + want := 89.3202 + got, err := hey.parseRequestsPerSecond(sampleData) + if err != nil { + t.Fatalf("failed to parse request per second with: %v", err) + } else if got != want { + t.Fatalf("got: %f, want: %f", got, want) + } + + want = 0.5351 + got, err = hey.parseAverageLatency(sampleData) + if err != nil { + t.Fatalf("failed to parse average latency with: %v", err) + } else if got != want { + t.Fatalf("got: %f, want: %f", got, want) + } +} diff --git a/test/benchmarks/tools/iperf.go b/test/benchmarks/tools/iperf.go new file mode 100644 index 000000000..df3d9349b --- /dev/null +++ b/test/benchmarks/tools/iperf.go @@ -0,0 +1,56 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tools + +import ( + "fmt" + "net" + "regexp" + "strconv" + "strings" + "testing" +) + +// Iperf is for the client side of `iperf`. +type Iperf struct { + Time int +} + +// MakeCmd returns a iperf client command. +func (i *Iperf) MakeCmd(ip net.IP, port int) []string { + // iperf report in Kb realtime + return strings.Split(fmt.Sprintf("iperf -f K --realtime --time %d -c %s -p %d", i.Time, ip, port), " ") +} + +// Report parses output from iperf client and reports metrics. +func (i *Iperf) Report(b *testing.B, output string) { + b.Helper() + // Parse bandwidth and report it. + bW, err := i.bandwidth(output) + if err != nil { + b.Fatalf("failed to parse bandwitdth from %s: %v", output, err) + } + b.ReportMetric(bW*1024, "bandwidth_b/s") // Convert from Kb/s to b/s. +} + +// bandwidth parses the Bandwidth number from an iperf report. A sample is below. +func (i *Iperf) bandwidth(data string) (float64, error) { + re := regexp.MustCompile(`\[\s*\d+\][^\n]+\s+(\d+\.?\d*)\s+KBytes/sec`) + match := re.FindStringSubmatch(data) + if len(match) < 1 { + return 0, fmt.Errorf("failed get bandwidth: %s", data) + } + return strconv.ParseFloat(match[1], 64) +} diff --git a/test/benchmarks/tools/iperf_test.go b/test/benchmarks/tools/iperf_test.go new file mode 100644 index 000000000..03bb30d05 --- /dev/null +++ b/test/benchmarks/tools/iperf_test.go @@ -0,0 +1,34 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package tools + +import "testing" + +// TestIperf checks the Iperf parsers on sample output. +func TestIperf(t *testing.T) { + sampleData := ` +------------------------------------------------------------ +Client connecting to 10.138.15.215, TCP port 32779 +TCP window size: 45.0 KByte (default) +------------------------------------------------------------ +[ 3] local 10.138.15.216 port 32866 connected with 10.138.15.215 port 32779 +[ ID] Interval Transfer Bandwidth +[ 3] 0.0-10.0 sec 459520 KBytes 45900 KBytes/sec +` + i := Iperf{} + bandwidth, err := i.bandwidth(sampleData) + if err != nil || bandwidth != 45900 { + t.Fatalf("failed with: %v and %f", err, bandwidth) + } +} diff --git a/test/benchmarks/tools/meminfo.go b/test/benchmarks/tools/meminfo.go new file mode 100644 index 000000000..2414a96a7 --- /dev/null +++ b/test/benchmarks/tools/meminfo.go @@ -0,0 +1,60 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tools + +import ( + "fmt" + "regexp" + "strconv" + "testing" +) + +// Meminfo wraps measurements of MemAvailable using /proc/meminfo. +type Meminfo struct { +} + +// MakeCmd returns a command for checking meminfo. +func (*Meminfo) MakeCmd() (string, []string) { + return "cat", []string{"/proc/meminfo"} +} + +// Report takes two reads of meminfo, parses them, and reports the difference +// divided by b.N. +func (*Meminfo) Report(b *testing.B, before, after string) { + b.Helper() + + beforeVal, err := parseMemAvailable(before) + if err != nil { + b.Fatalf("could not parse before value %s: %v", before, err) + } + + afterVal, err := parseMemAvailable(after) + if err != nil { + b.Fatalf("could not parse before value %s: %v", before, err) + } + val := 1024 * ((beforeVal - afterVal) / float64(b.N)) + b.ReportMetric(val, "average_container_size_bytes") +} + +var memInfoRE = regexp.MustCompile(`MemAvailable:\s*(\d+)\skB\n`) + +// parseMemAvailable grabs the MemAvailable number from /proc/meminfo. +func parseMemAvailable(data string) (float64, error) { + match := memInfoRE.FindStringSubmatch(data) + if len(match) < 2 { + return 0, fmt.Errorf("couldn't find MemAvailable in %s", data) + } + return strconv.ParseFloat(match[1], 64) +} diff --git a/test/benchmarks/tools/meminfo_test.go b/test/benchmarks/tools/meminfo_test.go new file mode 100644 index 000000000..ba803540f --- /dev/null +++ b/test/benchmarks/tools/meminfo_test.go @@ -0,0 +1,84 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tools + +import ( + "testing" +) + +// TestMeminfo checks the Meminfo parser on sample output. +func TestMeminfo(t *testing.T) { + sampleData := ` +MemTotal: 16337408 kB +MemFree: 3742696 kB +MemAvailable: 9319948 kB +Buffers: 1433884 kB +Cached: 4607036 kB +SwapCached: 45284 kB +Active: 8288376 kB +Inactive: 2685928 kB +Active(anon): 4724912 kB +Inactive(anon): 1047940 kB +Active(file): 3563464 kB +Inactive(file): 1637988 kB +Unevictable: 326940 kB +Mlocked: 48 kB +SwapTotal: 33292284 kB +SwapFree: 32865736 kB +Dirty: 708 kB +Writeback: 0 kB +AnonPages: 4304204 kB +Mapped: 975424 kB +Shmem: 910292 kB +KReclaimable: 744532 kB +Slab: 1058448 kB +SReclaimable: 744532 kB +SUnreclaim: 313916 kB +KernelStack: 25188 kB +PageTables: 65300 kB +NFS_Unstable: 0 kB +Bounce: 0 kB +WritebackTmp: 0 kB +CommitLimit: 41460988 kB +Committed_AS: 22859492 kB +VmallocTotal: 34359738367 kB +VmallocUsed: 63088 kB +VmallocChunk: 0 kB +Percpu: 9248 kB +HardwareCorrupted: 0 kB +AnonHugePages: 786432 kB +ShmemHugePages: 0 kB +ShmemPmdMapped: 0 kB +FileHugePages: 0 kB +FilePmdMapped: 0 kB +HugePages_Total: 0 +HugePages_Free: 0 +HugePages_Rsvd: 0 +HugePages_Surp: 0 +Hugepagesize: 2048 kB +Hugetlb: 0 kB +DirectMap4k: 5408532 kB +DirectMap2M: 11241472 kB +DirectMap1G: 1048576 kB +` + want := 9319948.0 + got, err := parseMemAvailable(sampleData) + if err != nil { + t.Fatalf("parseMemAvailable failed: %v", err) + } + if got != want { + t.Fatalf("parseMemAvailable got %f, want %f", got, want) + } +} diff --git a/test/benchmarks/tools/redis.go b/test/benchmarks/tools/redis.go new file mode 100644 index 000000000..c899ae0d4 --- /dev/null +++ b/test/benchmarks/tools/redis.go @@ -0,0 +1,63 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tools + +import ( + "fmt" + "net" + "regexp" + "strconv" + "strings" + "testing" +) + +// Redis is for the client 'redis-benchmark'. +type Redis struct { + Operation string +} + +// MakeCmd returns a redis-benchmark client command. +func (r *Redis) MakeCmd(ip net.IP, port int) []string { + // There is no -t PING_BULK for redis-benchmark, so adjust the command in that case. + // Note that "ping" will run both PING_INLINE and PING_BULK. + if r.Operation == "PING_BULK" { + return strings.Split( + fmt.Sprintf("redis-benchmark --csv -t ping -h %s -p %d", ip, port), " ") + } + + // runs redis-benchmark -t operation for 100K requests against server. + return strings.Split( + fmt.Sprintf("redis-benchmark --csv -t %s -h %s -p %d", r.Operation, ip, port), " ") +} + +// Report parses output from redis-benchmark client and reports metrics. +func (r *Redis) Report(b *testing.B, output string) { + b.Helper() + result, err := r.parseOperation(output) + if err != nil { + b.Fatalf("parsing result %s failed with err: %v", output, err) + } + b.ReportMetric(result, r.Operation) // operations per second +} + +// parseOperation grabs the metric operations per second from redis-benchmark output. +func (r *Redis) parseOperation(data string) (float64, error) { + re := regexp.MustCompile(fmt.Sprintf(`"%s( .*)?","(\d*\.\d*)"`, r.Operation)) + match := re.FindStringSubmatch(data) + if len(match) < 3 { + return 0.0, fmt.Errorf("could not find %s in %s", r.Operation, data) + } + return strconv.ParseFloat(match[2], 64) +} diff --git a/test/benchmarks/tools/redis_test.go b/test/benchmarks/tools/redis_test.go new file mode 100644 index 000000000..4bafda66f --- /dev/null +++ b/test/benchmarks/tools/redis_test.go @@ -0,0 +1,87 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tools + +import ( + "testing" +) + +// TestRedis checks the Redis parsers on sample output. +func TestRedis(t *testing.T) { + sampleData := ` + "PING_INLINE","48661.80" + "PING_BULK","50301.81" + "SET","48923.68" + "GET","49382.71" + "INCR","49975.02" + "LPUSH","49875.31" + "RPUSH","50276.52" + "LPOP","50327.12" + "RPOP","50556.12" + "SADD","49504.95" + "HSET","49504.95" + "SPOP","50025.02" + "LPUSH (needed to benchmark LRANGE)","48875.86" + "LRANGE_100 (first 100 elements)","33955.86" + "LRANGE_300 (first 300 elements)","16550.81"// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tools + + "LRANGE_500 (first 450 elements)","13653.74" + "LRANGE_600 (first 600 elements)","11219.57" + "MSET (10 keys)","44682.75" + ` + wants := map[string]float64{ + "PING_INLINE": 48661.80, + "PING_BULK": 50301.81, + "SET": 48923.68, + "GET": 49382.71, + "INCR": 49975.02, + "LPUSH": 49875.31, + "RPUSH": 50276.52, + "LPOP": 50327.12, + "RPOP": 50556.12, + "SADD": 49504.95, + "HSET": 49504.95, + "SPOP": 50025.02, + "LRANGE_100": 33955.86, + "LRANGE_300": 16550.81, + "LRANGE_500": 13653.74, + "LRANGE_600": 11219.57, + "MSET": 44682.75, + } + for op, want := range wants { + redis := Redis{ + Operation: op, + } + if got, err := redis.parseOperation(sampleData); err != nil { + t.Fatalf("failed to parse %s: %v", op, err) + } else if want != got { + t.Fatalf("wanted %f for op %s, got %f", want, op, got) + } + } +} diff --git a/test/benchmarks/tools/sysbench.go b/test/benchmarks/tools/sysbench.go new file mode 100644 index 000000000..6b2f75ca2 --- /dev/null +++ b/test/benchmarks/tools/sysbench.go @@ -0,0 +1,245 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tools + +import ( + "fmt" + "regexp" + "strconv" + "strings" + "testing" +) + +var warmup = "sysbench --threads=8 --memory-total-size=5G memory run > /dev/null &&" + +// Sysbench represents a 'sysbench' command. +type Sysbench interface { + MakeCmd() []string // Makes a sysbench command. + flags() []string + Report(*testing.B, string) // Reports results contained in string. +} + +// SysbenchBase is the top level struct for sysbench and holds top-level arguments +// for sysbench. See: 'sysbench --help' +type SysbenchBase struct { + Threads int // number of Threads for the test. + Time int // time limit for test in seconds. +} + +// baseFlags returns top level flags. +func (s *SysbenchBase) baseFlags() []string { + var ret []string + if s.Threads > 0 { + ret = append(ret, fmt.Sprintf("--threads=%d", s.Threads)) + } + if s.Time > 0 { + ret = append(ret, fmt.Sprintf("--time=%d", s.Time)) + } + return ret +} + +// SysbenchCPU is for 'sysbench [flags] cpu run' and holds CPU specific arguments. +type SysbenchCPU struct { + Base SysbenchBase + MaxPrime int // upper limit for primes generator [10000]. +} + +// MakeCmd makes commands for SysbenchCPU. +func (s *SysbenchCPU) MakeCmd() []string { + cmd := []string{warmup, "sysbench"} + cmd = append(cmd, s.flags()...) + cmd = append(cmd, "cpu run") + return []string{"sh", "-c", strings.Join(cmd, " ")} +} + +// flags makes flags for SysbenchCPU cmds. +func (s *SysbenchCPU) flags() []string { + cmd := s.Base.baseFlags() + if s.MaxPrime > 0 { + return append(cmd, fmt.Sprintf("--cpu-max-prime=%d", s.MaxPrime)) + } + return cmd +} + +// Report reports the relevant metrics for SysbenchCPU. +func (s *SysbenchCPU) Report(b *testing.B, output string) { + b.Helper() + result, err := s.parseEvents(output) + if err != nil { + b.Fatalf("parsing CPU events from %s failed: %v", output, err) + } + b.ReportMetric(result, "cpu_events_per_second") +} + +var cpuEventsPerSecondRE = regexp.MustCompile(`events per second:\s*(\d*.?\d*)\n`) + +// parseEvents parses cpu events per second. +func (s *SysbenchCPU) parseEvents(data string) (float64, error) { + match := cpuEventsPerSecondRE.FindStringSubmatch(data) + if len(match) < 2 { + return 0.0, fmt.Errorf("could not find events per second: %s", data) + } + return strconv.ParseFloat(match[1], 64) +} + +// SysbenchMemory is for 'sysbench [FLAGS] memory run' and holds Memory specific arguments. +type SysbenchMemory struct { + Base SysbenchBase + BlockSize string // size of test memory block [1K]. + TotalSize string // size of data to transfer [100G]. + Scope string // memory access scope {global, local} [global]. + HugeTLB bool // allocate memory from HugeTLB [off]. + OperationType string // type of memory ops {read, write, none} [write]. + AccessMode string // access mode {seq, rnd} [seq]. +} + +// MakeCmd makes commands for SysbenchMemory. +func (s *SysbenchMemory) MakeCmd() []string { + cmd := []string{warmup, "sysbench"} + cmd = append(cmd, s.flags()...) + cmd = append(cmd, "memory run") + return []string{"sh", "-c", strings.Join(cmd, " ")} +} + +// flags makes flags for SysbenchMemory cmds. +func (s *SysbenchMemory) flags() []string { + cmd := s.Base.baseFlags() + if s.BlockSize != "" { + cmd = append(cmd, fmt.Sprintf("--memory-block-size=%s", s.BlockSize)) + } + if s.TotalSize != "" { + cmd = append(cmd, fmt.Sprintf("--memory-total-size=%s", s.TotalSize)) + } + if s.Scope != "" { + cmd = append(cmd, fmt.Sprintf("--memory-scope=%s", s.Scope)) + } + if s.HugeTLB { + cmd = append(cmd, "--memory-hugetlb=on") + } + if s.OperationType != "" { + cmd = append(cmd, fmt.Sprintf("--memory-oper=%s", s.OperationType)) + } + if s.AccessMode != "" { + cmd = append(cmd, fmt.Sprintf("--memory-access-mode=%s", s.AccessMode)) + } + return cmd +} + +// Report reports the relevant metrics for SysbenchMemory. +func (s *SysbenchMemory) Report(b *testing.B, output string) { + b.Helper() + result, err := s.parseOperations(output) + if err != nil { + b.Fatalf("parsing result %s failed with err: %v", output, err) + } + b.ReportMetric(result, "operations_per_second") +} + +var memoryOperationsRE = regexp.MustCompile(`Total\soperations:\s+\d*\s*\((\d*\.\d*)\sper\ssecond\)`) + +// parseOperations parses memory operations per second form sysbench memory ouput. +func (s *SysbenchMemory) parseOperations(data string) (float64, error) { + match := memoryOperationsRE.FindStringSubmatch(data) + if len(match) < 2 { + return 0.0, fmt.Errorf("couldn't find memory operations per second: %s", data) + } + return strconv.ParseFloat(match[1], 64) +} + +// SysbenchMutex is for 'sysbench [FLAGS] mutex run' and holds Mutex specific arguments. +type SysbenchMutex struct { + Base SysbenchBase + Num int // total size of mutex array [4096]. + Locks int // number of mutex locks per thread [50K]. + Loops int // number of loops to do outside mutex lock [10K]. +} + +// MakeCmd makes commands for SysbenchMutex. +func (s *SysbenchMutex) MakeCmd() []string { + cmd := []string{warmup, "sysbench"} + cmd = append(cmd, s.flags()...) + cmd = append(cmd, "mutex run") + return []string{"sh", "-c", strings.Join(cmd, " ")} +} + +// flags makes flags for SysbenchMutex commands. +func (s *SysbenchMutex) flags() []string { + var cmd []string + cmd = append(cmd, s.Base.baseFlags()...) + if s.Num > 0 { + cmd = append(cmd, fmt.Sprintf("--mutex-num=%d", s.Num)) + } + if s.Locks > 0 { + cmd = append(cmd, fmt.Sprintf("--mutex-locks=%d", s.Locks)) + } + if s.Loops > 0 { + cmd = append(cmd, fmt.Sprintf("--mutex-loops=%d", s.Loops)) + } + return cmd +} + +// Report parses and reports relevant sysbench mutex metrics. +func (s *SysbenchMutex) Report(b *testing.B, output string) { + b.Helper() + + result, err := s.parseExecutionTime(output) + if err != nil { + b.Fatalf("parsing result %s failed with err: %v", output, err) + } + b.ReportMetric(result, "average_execution_time_secs") + + result, err = s.parseDeviation(output) + if err != nil { + b.Fatalf("parsing result %s failed with err: %v", output, err) + } + b.ReportMetric(result, "stdev_execution_time_secs") + + result, err = s.parseLatency(output) + if err != nil { + b.Fatalf("parsing result %s failed with err: %v", output, err) + } + b.ReportMetric(result/1000, "average_latency_secs") +} + +var executionTimeRE = regexp.MustCompile(`execution time \(avg/stddev\):\s*(\d*.?\d*)/(\d*.?\d*)`) + +// parseExecutionTime parses threads fairness average execution time from sysbench output. +func (s *SysbenchMutex) parseExecutionTime(data string) (float64, error) { + match := executionTimeRE.FindStringSubmatch(data) + if len(match) < 2 { + return 0.0, fmt.Errorf("could not find execution time average: %s", data) + } + return strconv.ParseFloat(match[1], 64) +} + +// parseDeviation parses threads fairness stddev time from sysbench output. +func (s *SysbenchMutex) parseDeviation(data string) (float64, error) { + match := executionTimeRE.FindStringSubmatch(data) + if len(match) < 3 { + return 0.0, fmt.Errorf("could not find execution time deviation: %s", data) + } + return strconv.ParseFloat(match[2], 64) +} + +var averageLatencyRE = regexp.MustCompile(`avg:[^\n^\d]*(\d*\.?\d*)`) + +// parseLatency parses latency from sysbench output. +func (s *SysbenchMutex) parseLatency(data string) (float64, error) { + match := averageLatencyRE.FindStringSubmatch(data) + if len(match) < 2 { + return 0.0, fmt.Errorf("could not find average latency: %s", data) + } + return strconv.ParseFloat(match[1], 64) +} diff --git a/test/benchmarks/tools/sysbench_test.go b/test/benchmarks/tools/sysbench_test.go new file mode 100644 index 000000000..850d1939e --- /dev/null +++ b/test/benchmarks/tools/sysbench_test.go @@ -0,0 +1,169 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tools + +import ( + "testing" +) + +// TestSysbenchCpu tests parses on sample 'sysbench cpu' output. +func TestSysbenchCpu(t *testing.T) { + sampleData := ` +sysbench 1.0.11 (using system LuaJIT 2.1.0-beta3) + +Running the test with following options: +Number of threads: 8 +Initializing random number generator from current time + + +Prime numbers limit: 10000 + +Initializing worker threads... + +Threads started! + +CPU speed: + events per second: 9093.38 + +General statistics: + total time: 10.0007s + total number of events: 90949 + +Latency (ms): + min: 0.64 + avg: 0.88 + max: 24.65 + 95th percentile: 1.55 + sum: 79936.91 + +Threads fairness: + events (avg/stddev): 11368.6250/831.38 + execution time (avg/stddev): 9.9921/0.01 +` + sysbench := SysbenchCPU{} + want := 9093.38 + if got, err := sysbench.parseEvents(sampleData); err != nil { + t.Fatalf("parse cpu events failed: %v", err) + } else if want != got { + t.Fatalf("got: %f want: %f", got, want) + } +} + +// TestSysbenchMemory tests parsers on sample 'sysbench memory' output. +func TestSysbenchMemory(t *testing.T) { + sampleData := ` +sysbench 1.0.11 (using system LuaJIT 2.1.0-beta3) + +Running the test with following options: +Number of threads: 8 +Initializing random number generator from current time + + +Running memory speed test with the following options: + block size: 1KiB + total size: 102400MiB + operation: write + scope: global + +Initializing worker threads... + +Threads started! + +Total operations: 47999046 (9597428.64 per second) + +46874.07 MiB transferred (9372.49 MiB/sec) + + +General statistics: + total time: 5.0001s + total number of events: 47999046 + +Latency (ms): + min: 0.00 + avg: 0.00 + max: 0.21 + 95th percentile: 0.00 + sum: 33165.91 + +Threads fairness: + events (avg/stddev): 5999880.7500/111242.52 + execution time (avg/stddev): 4.1457/0.09 +` + sysbench := SysbenchMemory{} + want := 9597428.64 + if got, err := sysbench.parseOperations(sampleData); err != nil { + t.Fatalf("parse memory ops failed: %v", err) + } else if want != got { + t.Fatalf("got: %f want: %f", got, want) + } +} + +// TestSysbenchMutex tests parsers on sample 'sysbench mutex' output. +func TestSysbenchMutex(t *testing.T) { + sampleData := ` +sysbench 1.0.11 (using system LuaJIT 2.1.0-beta3) + +The 'mutex' test requires a command argument. See 'sysbench mutex help' +root@ec078132e294:/# sysbench mutex --threads=8 run +sysbench 1.0.11 (using system LuaJIT 2.1.0-beta3) + +Running the test with following options: +Number of threads: 8 +Initializing random number generator from current time + + +Initializing worker threads... + +Threads started! + + +General statistics: + total time: 0.2320s + total number of events: 8 + +Latency (ms): + min: 152.35 + avg: 192.48 + max: 231.41 + 95th percentile: 231.53 + sum: 1539.83 + +Threads fairness: + events (avg/stddev): 1.0000/0.00 + execution time (avg/stddev): 0.1925/0.04 +` + + sysbench := SysbenchMutex{} + want := .1925 + if got, err := sysbench.parseExecutionTime(sampleData); err != nil { + t.Fatalf("parse mutex time failed: %v", err) + } else if want != got { + t.Fatalf("got: %f want: %f", got, want) + } + + want = 0.04 + if got, err := sysbench.parseDeviation(sampleData); err != nil { + t.Fatalf("parse mutex deviation failed: %v", err) + } else if want != got { + t.Fatalf("got: %f want: %f", got, want) + } + + want = 192.48 + if got, err := sysbench.parseLatency(sampleData); err != nil { + t.Fatalf("parse mutex time failed: %v", err) + } else if want != got { + t.Fatalf("got: %f want: %f", got, want) + } +} diff --git a/test/benchmarks/tools/tools.go b/test/benchmarks/tools/tools.go new file mode 100644 index 000000000..eb61c0136 --- /dev/null +++ b/test/benchmarks/tools/tools.go @@ -0,0 +1,17 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package tools holds tooling to couple command formatting and output parsers +// together. +package tools diff --git a/test/e2e/integration_test.go b/test/e2e/integration_test.go index ef42b689a..71ec4791e 100644 --- a/test/e2e/integration_test.go +++ b/test/e2e/integration_test.go @@ -431,11 +431,57 @@ func TestHostOverlayfsCopyUp(t *testing.T) { d := dockerutil.MakeContainer(ctx, t) defer d.CleanUp(ctx) - if _, err := d.Run(ctx, dockerutil.RunOpts{ + if got, err := d.Run(ctx, dockerutil.RunOpts{ Image: "basic/hostoverlaytest", WorkDir: "/root", - }, "./test"); err != nil { + }, "./test_copy_up"); err != nil { t.Fatalf("docker run failed: %v", err) + } else if got != "" { + t.Errorf("test failed:\n%s", got) + } +} + +// TestHostOverlayfsRewindDir tests that rewinddir() "causes the directory +// stream to refer to the current state of the corresponding directory, as a +// call to opendir() would have done" as required by POSIX, when the directory +// in question is host overlayfs. +// +// This test specifically targets host overlayfs because, per POSIX, "if a file +// is removed from or added to the directory after the most recent call to +// opendir() or rewinddir(), whether a subsequent call to readdir() returns an +// entry for that file is unspecified"; the host filesystems used by other +// automated tests yield newly-added files from readdir() even if the fsgofer +// does not explicitly rewinddir(), but overlayfs does not. +func TestHostOverlayfsRewindDir(t *testing.T) { + ctx := context.Background() + d := dockerutil.MakeContainer(ctx, t) + defer d.CleanUp(ctx) + + if got, err := d.Run(ctx, dockerutil.RunOpts{ + Image: "basic/hostoverlaytest", + WorkDir: "/root", + }, "./test_rewinddir"); err != nil { + t.Fatalf("docker run failed: %v", err) + } else if got != "" { + t.Errorf("test failed:\n%s", got) + } +} + +// Basic test for linkat(2). Syscall tests requires CAP_DAC_READ_SEARCH and it +// cannot use tricks like userns as root. For this reason, run a basic link test +// to ensure some coverage. +func TestLink(t *testing.T) { + ctx := context.Background() + d := dockerutil.MakeContainer(ctx, t) + defer d.CleanUp(ctx) + + if got, err := d.Run(ctx, dockerutil.RunOpts{ + Image: "basic/linktest", + WorkDir: "/root", + }, "./link_test"); err != nil { + t.Fatalf("docker run failed: %v", err) + } else if got != "" { + t.Errorf("test failed:\n%s", got) } } diff --git a/test/fuse/BUILD b/test/fuse/BUILD new file mode 100644 index 000000000..34b950644 --- /dev/null +++ b/test/fuse/BUILD @@ -0,0 +1 @@ +package(licenses = ["notice"]) diff --git a/test/fuse/README.md b/test/fuse/README.md new file mode 100644 index 000000000..734c3a4e3 --- /dev/null +++ b/test/fuse/README.md @@ -0,0 +1,103 @@ +# gVisor FUSE Test Suite + +This is an integration test suite for fuse(4) filesystem. It runs under both +gVisor and Linux, and ensures compatibility between the two. This test suite is +based on system calls test. + +This document describes the framework of fuse integration test and the +guidelines that should be followed when adding new fuse tests. + +## Integration Test Framework + +Please refer to the figure below. `>` is entering the function, `<` is leaving +the function, and `=` indicates sequentially entering and leaving. + +``` + | Client (Test Main Process) | Server (FUSE Daemon) + | | + | >TEST_F() | + | >SetUp() | + | =MountFuse() | + | >SetUpFuseServer() | + | [create communication pipes] | + | =fork() | =fork() + | >WaitCompleted() | + | [wait for MarkDone()] | + | | =ConsumeFuseInit() + | | =MarkDone() + | <WaitCompleted() | + | <SetUpFuseServer() | + | <SetUp() | + | >SetExpected() | + | [construct expected reaction] | + | | >FuseLoop() + | | >ReceiveExpected() + | | [wait data from pipe] + | [write data to pipe] | + | [wait for MarkDone()] | + | | [save data to memory] + | | =MarkDone() + | <SetExpected() | + | | <ReceiveExpected() + | | >read() + | | [wait for fs operation] + | >[Do fs operation] | + | [wait for fs response] | + | | <read() + | | =CompareRequest() + | | =write() [write fs response] + | <[Do fs operation] | + | =[Test fs operation result] | + | =[wait for MarkDone()] | + | | =MarkDone() + | >TearDown() | + | =UnmountFuse() | + | <TearDown() | + | <TEST_F() | +``` + +## Running the tests + +Based on syscall tests, fuse tests can run in different environments. To enable +fuse testing environment, the test targets should be appended with `_fuse`. + +For example, to run fuse test in `stat_test.cc`: + +```bash +$ bazel test //test/fuse:stat_test_runsc_ptrace_vfs2_fuse +``` + +Test all targets tagged with fuse: + +```bash +$ bazel test --test_tag_filters=fuse //test/fuse/... +``` + +## Writing a new FUSE test + +1. Add test targets in `BUILD` and `linux/BUILD`. +2. Inherit your test from `FuseTest` base class. It allows you to: + - Run a fake FUSE server in background during each test setup. + - Create pipes for communication and provide utility functions. + - Stop FUSE server after test completes. +3. Customize your comparison function for request assessment in FUSE server. +4. Add the mapping of the size of structs if you are working on new FUSE + opcode. + - Please update `FuseTest::GetPayloadSize()` for each new FUSE opcode. +5. Build the expected request-response pair of your FUSE operation. +6. Call `SetExpected()` function to inject the expected reaction. +7. Check the response and/or errors. +8. Finally call `WaitCompleted()` to ensure the FUSE server acts correctly. + +A few customized matchers used in syscalls test are encouraged to test the +outcome of filesystem operations. Such as: + +```cc +SyscallSucceeds() +SyscallSucceedsWithValue(...) +SyscallFails() +SyscallFailsWithErrno(...) +``` + +Please refer to [test/syscalls/README.md](../syscalls/README.md) for further +details. diff --git a/test/fuse/linux/BUILD b/test/fuse/linux/BUILD new file mode 100644 index 000000000..49dc96c20 --- /dev/null +++ b/test/fuse/linux/BUILD @@ -0,0 +1,21 @@ +load("//tools:defs.bzl", "cc_library", "gtest") + +package( + default_visibility = ["//:sandbox"], + licenses = ["notice"], +) + +cc_library( + name = "fuse_base", + testonly = 1, + srcs = [ + "fuse_base.cc", + "fuse_base.h", + ], + deps = [ + gtest, + "//test/util:posix_error", + "//test/util:test_util", + "@com_google_absl//absl/strings:str_format", + ], +) diff --git a/test/fuse/linux/fuse_base.cc b/test/fuse/linux/fuse_base.cc new file mode 100644 index 000000000..ce69276c9 --- /dev/null +++ b/test/fuse/linux/fuse_base.cc @@ -0,0 +1,207 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fuse_base.h" + +#include <fcntl.h> +#include <linux/fuse.h> +#include <stdio.h> +#include <string.h> +#include <sys/mount.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <unistd.h> + +#include <iostream> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/strings/str_format.h" +#include "test/util/posix_error.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +void FuseTest::SetUp() { + MountFuse(); + SetUpFuseServer(); +} + +void FuseTest::TearDown() { UnmountFuse(); } + +// Since CompareRequest is running in background thread, gTest assertions and +// expectations won't directly reflect the test result. However, the FUSE +// background server still connects to the same standard I/O as testing main +// thread. So EXPECT_XX can still be used to show different results. To +// ensure failed testing result is observable, return false and the result +// will be sent to test main thread via pipe. +bool FuseTest::CompareRequest(void* expected_mem, size_t expected_len, + void* real_mem, size_t real_len) { + if (expected_len != real_len) return false; + return memcmp(expected_mem, real_mem, expected_len) == 0; +} + +// SetExpected is called by the testing main thread to set expected request- +// response pair of a single FUSE operation. +void FuseTest::SetExpected(struct iovec* iov_in, int iov_in_cnt, + struct iovec* iov_out, int iov_out_cnt) { + EXPECT_THAT(RetryEINTR(writev)(set_expected_[1], iov_in, iov_in_cnt), + SyscallSucceedsWithValue(::testing::Gt(0))); + WaitCompleted(); + + EXPECT_THAT(RetryEINTR(writev)(set_expected_[1], iov_out, iov_out_cnt), + SyscallSucceedsWithValue(::testing::Gt(0))); + WaitCompleted(); +} + +// WaitCompleted waits for the FUSE server to finish its job and check if it +// completes without errors. +void FuseTest::WaitCompleted() { + char success; + EXPECT_THAT(RetryEINTR(read)(done_[0], &success, sizeof(success)), + SyscallSucceedsWithValue(1)); +} + +void FuseTest::MountFuse() { + EXPECT_THAT(dev_fd_ = open("/dev/fuse", O_RDWR), SyscallSucceeds()); + + std::string mount_opts = absl::StrFormat("fd=%d,%s", dev_fd_, kMountOpts); + EXPECT_THAT(mount("fuse", kMountPoint, "fuse", MS_NODEV | MS_NOSUID, + mount_opts.c_str()), + SyscallSucceedsWithValue(0)); +} + +void FuseTest::UnmountFuse() { + EXPECT_THAT(umount(kMountPoint), SyscallSucceeds()); + // TODO(gvisor.dev/issue/3330): ensure the process is terminated successfully. +} + +// ConsumeFuseInit consumes the first FUSE request and returns the +// corresponding PosixError. +PosixError FuseTest::ConsumeFuseInit() { + RETURN_ERROR_IF_SYSCALL_FAIL( + RetryEINTR(read)(dev_fd_, buf_.data(), buf_.size())); + + struct iovec iov_out[2]; + struct fuse_out_header out_header = { + .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_init_out), + .error = 0, + .unique = 2, + }; + // Returns a fake fuse_init_out with 7.0 version to avoid ECONNREFUSED + // error in the initialization of FUSE connection. + struct fuse_init_out out_payload = { + .major = 7, + }; + iov_out[0].iov_len = sizeof(out_header); + iov_out[0].iov_base = &out_header; + iov_out[1].iov_len = sizeof(out_payload); + iov_out[1].iov_base = &out_payload; + + RETURN_ERROR_IF_SYSCALL_FAIL(RetryEINTR(writev)(dev_fd_, iov_out, 2)); + return NoError(); +} + +// ReceiveExpected reads 1 pair of expected fuse request-response `iovec`s +// from pipe and save them into member variables of this testing instance. +void FuseTest::ReceiveExpected() { + // Set expected fuse_in request. + EXPECT_THAT(len_in_ = RetryEINTR(read)(set_expected_[0], mem_in_.data(), + mem_in_.size()), + SyscallSucceedsWithValue(::testing::Gt(0))); + MarkDone(len_in_ > 0); + + // Set expected fuse_out response. + EXPECT_THAT(len_out_ = RetryEINTR(read)(set_expected_[0], mem_out_.data(), + mem_out_.size()), + SyscallSucceedsWithValue(::testing::Gt(0))); + MarkDone(len_out_ > 0); +} + +// MarkDone writes 1 byte of success indicator through pipe. +void FuseTest::MarkDone(bool success) { + char data = success ? 1 : 0; + EXPECT_THAT(RetryEINTR(write)(done_[1], &data, sizeof(data)), + SyscallSucceedsWithValue(1)); +} + +// FuseLoop is the implementation of the fake FUSE server. Read from /dev/fuse, +// compare the request by CompareRequest (use derived function if specified), +// and write the expected response to /dev/fuse. +void FuseTest::FuseLoop() { + bool success = true; + ssize_t len = 0; + while (true) { + ReceiveExpected(); + + EXPECT_THAT(len = RetryEINTR(read)(dev_fd_, buf_.data(), buf_.size()), + SyscallSucceedsWithValue(len_in_)); + if (len != len_in_) success = false; + + if (!CompareRequest(buf_.data(), len_in_, mem_in_.data(), len_in_)) { + std::cerr << "the FUSE request is not expected" << std::endl; + success = false; + } + + EXPECT_THAT(len = RetryEINTR(write)(dev_fd_, mem_out_.data(), len_out_), + SyscallSucceedsWithValue(len_out_)); + if (len != len_out_) success = false; + MarkDone(success); + } +} + +// SetUpFuseServer creates 2 pipes. First is for testing client to send the +// expected request-response pair, and the other acts as a checkpoint for the +// FUSE server to notify the client that it can proceed. +void FuseTest::SetUpFuseServer() { + ASSERT_THAT(pipe(set_expected_), SyscallSucceedsWithValue(0)); + ASSERT_THAT(pipe(done_), SyscallSucceedsWithValue(0)); + + switch (fork()) { + case -1: + GTEST_FAIL(); + return; + case 0: + break; + default: + ASSERT_THAT(close(set_expected_[0]), SyscallSucceedsWithValue(0)); + ASSERT_THAT(close(done_[1]), SyscallSucceedsWithValue(0)); + WaitCompleted(); + return; + } + + ASSERT_THAT(close(set_expected_[1]), SyscallSucceedsWithValue(0)); + ASSERT_THAT(close(done_[0]), SyscallSucceedsWithValue(0)); + + MarkDone(ConsumeFuseInit().ok()); + + FuseLoop(); + _exit(0); +} + +// GetPayloadSize is a helper function to get the number of bytes of a +// specific FUSE operation struct. +size_t FuseTest::GetPayloadSize(uint32_t opcode, bool in) { + switch (opcode) { + case FUSE_INIT: + return in ? sizeof(struct fuse_init_in) : sizeof(struct fuse_init_out); + default: + break; + } + return 0; +} + +} // namespace testing +} // namespace gvisor diff --git a/test/fuse/linux/fuse_base.h b/test/fuse/linux/fuse_base.h new file mode 100644 index 000000000..b008778de --- /dev/null +++ b/test/fuse/linux/fuse_base.h @@ -0,0 +1,97 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_FUSE_FUSE_BASE_H_ +#define GVISOR_TEST_FUSE_FUSE_BASE_H_ + +#include <linux/fuse.h> +#include <sys/uio.h> + +#include <vector> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "test/util/posix_error.h" + +namespace gvisor { +namespace testing { + +constexpr char kMountPoint[] = "/mnt"; +constexpr char kMountOpts[] = "rootmode=755,user_id=0,group_id=0"; + +class FuseTest : public ::testing::Test { + public: + FuseTest() { + buf_.resize(FUSE_MIN_READ_BUFFER); + mem_in_.resize(FUSE_MIN_READ_BUFFER); + mem_out_.resize(FUSE_MIN_READ_BUFFER); + } + void SetUp() override; + void TearDown() override; + + // CompareRequest is used by the FUSE server and should be implemented to + // compare different FUSE operations. It compares the actual FUSE input + // request with the expected one set by `SetExpected()`. + virtual bool CompareRequest(void* expected_mem, size_t expected_len, + void* real_mem, size_t real_len); + + // SetExpected is called by the testing main thread. Writes a request- + // response pair into FUSE server's member variables via pipe. + void SetExpected(struct iovec* iov_in, int iov_in_cnt, struct iovec* iov_out, + int iov_out_cnt); + + // WaitCompleted waits for FUSE server to complete its processing. It + // complains if the FUSE server responds failure during tests. + void WaitCompleted(); + + private: + void MountFuse(); + void UnmountFuse(); + + // ConsumeFuseInit is only used during FUSE server setup. + PosixError ConsumeFuseInit(); + + // ReceiveExpected is the FUSE server side's corresponding code of + // `SetExpected()`. Save the request-response pair into its memory. + void ReceiveExpected(); + + // MarkDone is used by the FUSE server to tell testing main if it's OK to + // proceed next command. + void MarkDone(bool success); + + // FuseLoop is where the FUSE server stay until it is terminated. + void FuseLoop(); + + // SetUpFuseServer creates 2 pipes for communication and forks FUSE server. + void SetUpFuseServer(); + + // GetPayloadSize is a helper function to get the number of bytes of a + // specific FUSE operation struct. + size_t GetPayloadSize(uint32_t opcode, bool in); + + int dev_fd_; + int set_expected_[2]; + int done_[2]; + + std::vector<char> buf_; + std::vector<char> mem_in_; + std::vector<char> mem_out_; + ssize_t len_in_; + ssize_t len_out_; +}; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_FUSE_FUSE_BASE_H_ diff --git a/test/iptables/BUILD b/test/iptables/BUILD index 40b63ebbe..66453772a 100644 --- a/test/iptables/BUILD +++ b/test/iptables/BUILD @@ -9,6 +9,7 @@ go_library( "filter_input.go", "filter_output.go", "iptables.go", + "iptables_unsafe.go", "iptables_util.go", "nat.go", ], diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go index 550b6198a..fda5f694f 100644 --- a/test/iptables/iptables_test.go +++ b/test/iptables/iptables_test.go @@ -371,3 +371,11 @@ func TestFilterAddrs(t *testing.T) { } } } + +func TestNATPreOriginalDst(t *testing.T) { + singleTest(t, NATPreOriginalDst{}) +} + +func TestNATOutOriginalDst(t *testing.T) { + singleTest(t, NATOutOriginalDst{}) +} diff --git a/test/iptables/iptables_unsafe.go b/test/iptables/iptables_unsafe.go new file mode 100644 index 000000000..bd85a8fea --- /dev/null +++ b/test/iptables/iptables_unsafe.go @@ -0,0 +1,63 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package iptables + +import ( + "fmt" + "syscall" + "unsafe" +) + +type originalDstError struct { + errno syscall.Errno +} + +func (e originalDstError) Error() string { + return fmt.Sprintf("errno (%d) when calling getsockopt(SO_ORIGINAL_DST): %v", int(e.errno), e.errno.Error()) +} + +// SO_ORIGINAL_DST gets the original destination of a redirected packet via +// getsockopt. +const SO_ORIGINAL_DST = 80 + +func originalDestination4(connfd int) (syscall.RawSockaddrInet4, error) { + var addr syscall.RawSockaddrInet4 + var addrLen uint32 = syscall.SizeofSockaddrInet4 + if errno := originalDestination(connfd, syscall.SOL_IP, unsafe.Pointer(&addr), &addrLen); errno != 0 { + return syscall.RawSockaddrInet4{}, originalDstError{errno} + } + return addr, nil +} + +func originalDestination6(connfd int) (syscall.RawSockaddrInet6, error) { + var addr syscall.RawSockaddrInet6 + var addrLen uint32 = syscall.SizeofSockaddrInet6 + if errno := originalDestination(connfd, syscall.SOL_IPV6, unsafe.Pointer(&addr), &addrLen); errno != 0 { + return syscall.RawSockaddrInet6{}, originalDstError{errno} + } + return addr, nil +} + +func originalDestination(connfd int, level uintptr, optval unsafe.Pointer, optlen *uint32) syscall.Errno { + _, _, errno := syscall.Syscall6( + syscall.SYS_GETSOCKOPT, + uintptr(connfd), + level, + SO_ORIGINAL_DST, + uintptr(optval), + uintptr(unsafe.Pointer(optlen)), + 0) + return errno +} diff --git a/test/iptables/iptables_util.go b/test/iptables/iptables_util.go index ca80a4b5f..5125fe47b 100644 --- a/test/iptables/iptables_util.go +++ b/test/iptables/iptables_util.go @@ -15,6 +15,8 @@ package iptables import ( + "encoding/binary" + "errors" "fmt" "net" "os/exec" @@ -218,17 +220,58 @@ func filterAddrs(addrs []string, ipv6 bool) []string { // getInterfaceName returns the name of the interface other than loopback. func getInterfaceName() (string, bool) { - var ifname string + iface, ok := getNonLoopbackInterface() + if !ok { + return "", false + } + return iface.Name, true +} + +func getInterfaceAddrs(ipv6 bool) ([]net.IP, error) { + iface, ok := getNonLoopbackInterface() + if !ok { + return nil, errors.New("no non-loopback interface found") + } + addrs, err := iface.Addrs() + if err != nil { + return nil, err + } + + // Get only IPv4 or IPv6 addresses. + ips := make([]net.IP, 0, len(addrs)) + for _, addr := range addrs { + parts := strings.Split(addr.String(), "/") + var ip net.IP + // To16() returns IPv4 addresses as IPv4-mapped IPv6 addresses. + // So we check whether To4() returns nil to test whether the + // address is v4 or v6. + if v4 := net.ParseIP(parts[0]).To4(); ipv6 && v4 == nil { + ip = net.ParseIP(parts[0]).To16() + } else { + ip = v4 + } + if ip != nil { + ips = append(ips, ip) + } + } + return ips, nil +} + +func getNonLoopbackInterface() (net.Interface, bool) { if interfaces, err := net.Interfaces(); err == nil { for _, intf := range interfaces { if intf.Name != "lo" { - ifname = intf.Name - break + return intf, true } } } + return net.Interface{}, false +} - return ifname, ifname != "" +func htons(x uint16) uint16 { + buf := make([]byte, 2) + binary.BigEndian.PutUint16(buf, x) + return binary.LittleEndian.Uint16(buf) } func localIP(ipv6 bool) string { diff --git a/test/iptables/nat.go b/test/iptables/nat.go index ac0d91bb2..b7fea2527 100644 --- a/test/iptables/nat.go +++ b/test/iptables/nat.go @@ -18,12 +18,11 @@ import ( "errors" "fmt" "net" + "syscall" "time" ) -const ( - redirectPort = 42 -) +const redirectPort = 42 func init() { RegisterTestCase(NATPreRedirectUDPPort{}) @@ -42,6 +41,8 @@ func init() { RegisterTestCase(NATOutRedirectInvert{}) RegisterTestCase(NATRedirectRequiresProtocol{}) RegisterTestCase(NATLoopbackSkipsPrerouting{}) + RegisterTestCase(NATPreOriginalDst{}) + RegisterTestCase(NATOutOriginalDst{}) } // NATPreRedirectUDPPort tests that packets are redirected to different port. @@ -471,6 +472,151 @@ func (NATLoopbackSkipsPrerouting) LocalAction(ip net.IP, ipv6 bool) error { return nil } +// NATPreOriginalDst tests that SO_ORIGINAL_DST returns the pre-NAT destination +// of PREROUTING NATted packets. +type NATPreOriginalDst struct{} + +// Name implements TestCase.Name. +func (NATPreOriginalDst) Name() string { + return "NATPreOriginalDst" +} + +// ContainerAction implements TestCase.ContainerAction. +func (NATPreOriginalDst) ContainerAction(ip net.IP, ipv6 bool) error { + // Redirect incoming TCP connections to acceptPort. + if err := natTable(ipv6, "-A", "PREROUTING", + "-p", "tcp", + "--destination-port", fmt.Sprintf("%d", dropPort), + "-j", "REDIRECT", "--to-port", fmt.Sprintf("%d", acceptPort)); err != nil { + return err + } + + addrs, err := getInterfaceAddrs(ipv6) + if err != nil { + return err + } + return listenForRedirectedConn(ipv6, addrs) +} + +// LocalAction implements TestCase.LocalAction. +func (NATPreOriginalDst) LocalAction(ip net.IP, ipv6 bool) error { + return connectTCP(ip, dropPort, sendloopDuration) +} + +// NATOutOriginalDst tests that SO_ORIGINAL_DST returns the pre-NAT destination +// of OUTBOUND NATted packets. +type NATOutOriginalDst struct{} + +// Name implements TestCase.Name. +func (NATOutOriginalDst) Name() string { + return "NATOutOriginalDst" +} + +// ContainerAction implements TestCase.ContainerAction. +func (NATOutOriginalDst) ContainerAction(ip net.IP, ipv6 bool) error { + // Redirect incoming TCP connections to acceptPort. + if err := natTable(ipv6, "-A", "OUTPUT", "-p", "tcp", "-j", "REDIRECT", "--to-port", fmt.Sprintf("%d", acceptPort)); err != nil { + return err + } + + connCh := make(chan error) + go func() { + connCh <- connectTCP(ip, dropPort, sendloopDuration) + }() + + if err := listenForRedirectedConn(ipv6, []net.IP{ip}); err != nil { + return err + } + return <-connCh +} + +// LocalAction implements TestCase.LocalAction. +func (NATOutOriginalDst) LocalAction(ip net.IP, ipv6 bool) error { + // No-op. + return nil +} + +func listenForRedirectedConn(ipv6 bool, originalDsts []net.IP) error { + // The net package doesn't give guarantee access to the connection's + // underlying FD, and thus we cannot call getsockopt. We have to use + // traditional syscalls for SO_ORIGINAL_DST. + + // Create the listening socket, bind, listen, and accept. + family := syscall.AF_INET + if ipv6 { + family = syscall.AF_INET6 + } + sockfd, err := syscall.Socket(family, syscall.SOCK_STREAM, 0) + if err != nil { + return err + } + defer syscall.Close(sockfd) + + var bindAddr syscall.Sockaddr + if ipv6 { + bindAddr = &syscall.SockaddrInet6{ + Port: acceptPort, + Addr: [16]byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // in6addr_any + } + } else { + bindAddr = &syscall.SockaddrInet4{ + Port: acceptPort, + Addr: [4]byte{0, 0, 0, 0}, // INADDR_ANY + } + } + if err := syscall.Bind(sockfd, bindAddr); err != nil { + return err + } + + if err := syscall.Listen(sockfd, 1); err != nil { + return err + } + + connfd, _, err := syscall.Accept(sockfd) + if err != nil { + return err + } + defer syscall.Close(connfd) + + // Verify that, despite listening on acceptPort, SO_ORIGINAL_DST + // indicates the packet was sent to originalDst:dropPort. + if ipv6 { + got, err := originalDestination6(connfd) + if err != nil { + return err + } + // The original destination could be any of our IPs. + for _, dst := range originalDsts { + want := syscall.RawSockaddrInet6{ + Family: syscall.AF_INET6, + Port: htons(dropPort), + } + copy(want.Addr[:], dst.To16()) + if got == want { + return nil + } + } + return fmt.Errorf("SO_ORIGINAL_DST returned %+v, but wanted one of %+v (note: port numbers are in network byte order)", got, originalDsts) + } else { + got, err := originalDestination4(connfd) + if err != nil { + return err + } + // The original destination could be any of our IPs. + for _, dst := range originalDsts { + want := syscall.RawSockaddrInet4{ + Family: syscall.AF_INET, + Port: htons(dropPort), + } + copy(want.Addr[:], dst.To4()) + if got == want { + return nil + } + } + return fmt.Errorf("SO_ORIGINAL_DST returned %+v, but wanted one of %+v (note: port numbers are in network byte order)", got, originalDsts) + } +} + // loopbackTests runs an iptables rule and ensures that packets sent to // dest:dropPort are received by localhost:acceptPort. func loopbackTest(ipv6 bool, dest net.IP, args ...string) error { diff --git a/test/packetdrill/BUILD b/test/packetdrill/BUILD index dfcd55f60..49642f282 100644 --- a/test/packetdrill/BUILD +++ b/test/packetdrill/BUILD @@ -1,4 +1,5 @@ -load("defs.bzl", "packetdrill_test") +load("//tools:defs.bzl", "bzl_library") +load("//test/packetdrill:defs.bzl", "packetdrill_test") package(licenses = ["notice"]) @@ -36,3 +37,9 @@ packetdrill_test( name = "tcp_defer_accept_timeout_test", scripts = ["tcp_defer_accept_timeout.pkt"], ) + +bzl_library( + name = "defs_bzl", + srcs = ["defs.bzl"], + visibility = ["//visibility:private"], +) diff --git a/test/packetimpact/runner/BUILD b/test/packetimpact/runner/BUILD index bad4f0183..ff2be9b30 100644 --- a/test/packetimpact/runner/BUILD +++ b/test/packetimpact/runner/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_test") +load("//tools:defs.bzl", "bzl_library", "go_test") package( default_visibility = ["//test/packetimpact:__subpackages__"], @@ -19,3 +19,9 @@ go_test( "@com_github_docker_docker//api/types/mount:go_default_library", ], ) + +bzl_library( + name = "defs_bzl", + srcs = ["defs.bzl"], + visibility = ["//visibility:private"], +) diff --git a/test/packetimpact/runner/defs.bzl b/test/packetimpact/runner/defs.bzl index 79b3c9162..93a36c6c2 100644 --- a/test/packetimpact/runner/defs.bzl +++ b/test/packetimpact/runner/defs.bzl @@ -61,12 +61,12 @@ PACKETIMPACT_TAGS = [ "packetimpact", ] -def packetimpact_linux_test( +def packetimpact_native_test( name, testbench_binary, expect_failure = False, **kwargs): - """Add a packetimpact test on linux. + """Add a native packetimpact test. Args: name: name of the test @@ -76,9 +76,9 @@ def packetimpact_linux_test( """ expect_failure_flag = ["--expect_failure"] if expect_failure else [] _packetimpact_test( - name = name + "_linux_test", + name = name + "_native_test", testbench_binary = testbench_binary, - flags = ["--dut_platform", "linux"] + expect_failure_flag, + flags = ["--native"] + expect_failure_flag, tags = PACKETIMPACT_TAGS, **kwargs ) @@ -102,21 +102,21 @@ def packetimpact_netstack_test( _packetimpact_test( name = name + "_netstack_test", testbench_binary = testbench_binary, - # This is the default runtime unless - # "--test_arg=--runtime=OTHER_RUNTIME" is used to override the value. - flags = ["--dut_platform", "netstack", "--runtime=runsc-d"] + expect_failure_flag, + # Note that a distinct runtime must be provided in the form + # --test_arg=--runtime=other when invoking bazel. + flags = expect_failure_flag, tags = PACKETIMPACT_TAGS, **kwargs ) -def packetimpact_go_test(name, size = "small", pure = True, expect_linux_failure = False, expect_netstack_failure = False, **kwargs): +def packetimpact_go_test(name, size = "small", pure = True, expect_native_failure = False, expect_netstack_failure = False, **kwargs): """Add packetimpact tests written in go. Args: name: name of the test size: size of the test pure: make a static go binary - expect_linux_failure: the test must fail for Linux + expect_native_failure: the test must fail natively expect_netstack_failure: the test must fail for Netstack **kwargs: all the other args, forwarded to go_test """ @@ -131,9 +131,9 @@ def packetimpact_go_test(name, size = "small", pure = True, expect_linux_failure ], **kwargs ) - packetimpact_linux_test( + packetimpact_native_test( name = name, - expect_failure = expect_linux_failure, + expect_failure = expect_native_failure, testbench_binary = testbench_binary, ) packetimpact_netstack_test( diff --git a/test/packetimpact/runner/packetimpact_test.go b/test/packetimpact/runner/packetimpact_test.go index 74e1e6def..e8c183977 100644 --- a/test/packetimpact/runner/packetimpact_test.go +++ b/test/packetimpact/runner/packetimpact_test.go @@ -50,7 +50,7 @@ func (l *stringList) Set(value string) error { } var ( - dutPlatform = flag.String("dut_platform", "", "either \"linux\" or \"netstack\"") + native = flag.Bool("native", false, "whether the test should be run natively") testbenchBinary = flag.String("testbench_binary", "", "path to the testbench binary") tshark = flag.Bool("tshark", false, "use more verbose tshark in logs instead of tcpdump") extraTestArgs = stringList{} @@ -84,17 +84,9 @@ func (l logger) Logf(format string, args ...interface{}) { func TestOne(t *testing.T) { flag.Var(&extraTestArgs, "extra_test_arg", "extra arguments to pass to the testbench") flag.Parse() - if *dutPlatform != "linux" && *dutPlatform != "netstack" { - t.Fatal("--dut_platform should be either linux or netstack") - } if *testbenchBinary == "" { t.Fatal("--testbench_binary is missing") } - if *dutPlatform == "netstack" { - if _, err := dockerutil.RuntimePath(); err != nil { - t.Fatal("--runtime is missing or invalid with --dut_platform=netstack:", err) - } - } dockerutil.EnsureSupportedDockerVersion() ctx := context.Background() @@ -140,9 +132,11 @@ func TestOne(t *testing.T) { const testOutputDir = "/tmp/testoutput" // Create the Docker container for the DUT. - dut := dockerutil.MakeContainer(ctx, logger("dut")) - if *dutPlatform == "linux" { + var dut *dockerutil.Container + if *native { dut = dockerutil.MakeNativeContainer(ctx, logger("dut")) + } else { + dut = dockerutil.MakeContainer(ctx, logger("dut")) } runOpts := dockerutil.RunOpts{ @@ -307,7 +301,7 @@ func TestOne(t *testing.T) { "--remote_mac", remoteMAC.String(), "--remote_interface_id", fmt.Sprintf("%d", dutDeviceInfo.ID), "--device", testNetDev, - "--dut_type", *dutPlatform, + fmt.Sprintf("--native=%t", *native), ) testbenchLogs, err := testbench.Exec(ctx, dockerutil.ExecOpts{}, testArgs...) if (err != nil) != *expectFailure { diff --git a/test/packetimpact/testbench/testbench.go b/test/packetimpact/testbench/testbench.go index 242464e3a..e3629e1f3 100644 --- a/test/packetimpact/testbench/testbench.go +++ b/test/packetimpact/testbench/testbench.go @@ -27,8 +27,8 @@ import ( ) var ( - // DUTType is the type of device under test. - DUTType = "" + // Native indicates that the test is being run natively. + Native = false // Device is the local device on the test network. Device = "" @@ -81,7 +81,7 @@ func RegisterFlags(fs *flag.FlagSet) { fs.StringVar(&RemoteIPv6, "remote_ipv6", RemoteIPv6, "remote IPv6 address for test packets") fs.StringVar(&RemoteMAC, "remote_mac", RemoteMAC, "remote mac address for test packets") fs.StringVar(&Device, "device", Device, "local device for test packets") - fs.StringVar(&DUTType, "dut_type", DUTType, "type of device under test") + fs.BoolVar(&Native, "native", Native, "whether the test is running natively") fs.Uint64Var(&RemoteInterfaceID, "remote_interface_id", RemoteInterfaceID, "remote interface ID for test packets") } diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD index 0c2a05380..74658fea0 100644 --- a/test/packetimpact/tests/BUILD +++ b/test/packetimpact/tests/BUILD @@ -40,8 +40,6 @@ packetimpact_go_test( packetimpact_go_test( name = "udp_recv_mcast_bcast", srcs = ["udp_recv_mcast_bcast_test.go"], - # TODO(b/152813495): Fix netstack then remove the line below. - expect_netstack_failure = True, deps = [ "//pkg/tcpip", "//pkg/tcpip/header", diff --git a/test/packetimpact/tests/tcp_reordering_test.go b/test/packetimpact/tests/tcp_reordering_test.go index 8742819ca..b4aeaab57 100644 --- a/test/packetimpact/tests/tcp_reordering_test.go +++ b/test/packetimpact/tests/tcp_reordering_test.go @@ -54,13 +54,13 @@ func TestReorderingWindow(t *testing.T) { acceptFd, _ := dut.Accept(t, listenFd) defer dut.Close(t, acceptFd) - if tb.DUTType == "linux" { + if tb.Native { // Linux has changed its handling of reordering, force the old behavior. dut.SetSockOpt(t, acceptFd, unix.IPPROTO_TCP, unix.TCP_CONGESTION, []byte("reno")) } pls := dut.GetSockOptInt(t, acceptFd, unix.IPPROTO_TCP, unix.TCP_MAXSEG) - if tb.DUTType == "netstack" { + if !tb.Native { // netstack does not impliment TCP_MAXSEG correctly. Fake it // here. Netstack uses the max SACK size which is 32. The MSS // option is 8 bytes, making the total 36 bytes. @@ -141,7 +141,7 @@ func TestReorderingWindow(t *testing.T) { } } - if tb.DUTType == "netstack" { + if !tb.Native { // The window should now be halved, so we should receive any // more, even if we send them. dut.Send(t, acceptFd, payload, 0) diff --git a/test/runner/BUILD b/test/runner/BUILD index 63c7ec83a..582d2946d 100644 --- a/test/runner/BUILD +++ b/test/runner/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_binary") +load("//tools:defs.bzl", "bzl_library", "go_binary") package(licenses = ["notice"]) @@ -21,3 +21,9 @@ go_binary( "@org_golang_x_sys//unix:go_default_library", ], ) + +bzl_library( + name = "defs_bzl", + srcs = ["defs.bzl"], + visibility = ["//visibility:private"], +) diff --git a/test/runner/defs.bzl b/test/runner/defs.bzl index c92392b35..ba4732ca6 100644 --- a/test/runner/defs.bzl +++ b/test/runner/defs.bzl @@ -201,7 +201,7 @@ def syscall_test( platform = default_platform, use_tmpfs = use_tmpfs, add_uds_tree = add_uds_tree, - tags = platforms[default_platform] + vfs2_tags, + tags = platforms[default_platform] + vfs2_tags + ["fuse"], vfs2 = True, fuse = True, ) diff --git a/test/runtimes/BUILD b/test/runtimes/BUILD index 1728161ce..066338ee3 100644 --- a/test/runtimes/BUILD +++ b/test/runtimes/BUILD @@ -1,3 +1,4 @@ +load("//tools:defs.bzl", "bzl_library") load("//test/runtimes:defs.bzl", "runtime_test") package(licenses = ["notice"]) @@ -6,33 +7,40 @@ runtime_test( name = "go1.12", exclude_file = "exclude_go1.12.csv", lang = "go", - shard_count = 5, + shard_count = 8, ) runtime_test( name = "java11", + batch = 100, exclude_file = "exclude_java11.csv", lang = "java", - shard_count = 20, + shard_count = 16, ) runtime_test( name = "nodejs12.4.0", exclude_file = "exclude_nodejs12.4.0.csv", lang = "nodejs", - shard_count = 10, + shard_count = 8, ) runtime_test( name = "php7.3.6", exclude_file = "exclude_php7.3.6.csv", lang = "php", - shard_count = 5, + shard_count = 8, ) runtime_test( name = "python3.7.3", exclude_file = "exclude_python3.7.3.csv", lang = "python", - shard_count = 5, + shard_count = 8, +) + +bzl_library( + name = "defs_bzl", + srcs = ["defs.bzl"], + visibility = ["//visibility:private"], ) diff --git a/test/runtimes/defs.bzl b/test/runtimes/defs.bzl index 5779b9591..702522d86 100644 --- a/test/runtimes/defs.bzl +++ b/test/runtimes/defs.bzl @@ -9,6 +9,8 @@ def _runtime_test_impl(ctx): ctx.attr.lang, "--image", ctx.attr.image, + "--batch", + str(ctx.attr.batch), ] if ctx.attr.exclude_file: args += [ @@ -47,11 +49,19 @@ _runtime_test = rule( mandatory = False, allow_single_file = True, ), + "batch": attr.int( + default = 50, + mandatory = False, + ), "_runner": attr.label( default = "//test/runtimes/runner:runner", + executable = True, + cfg = "target", ), "_proctor": attr.label( default = "//test/runtimes/proctor:proctor", + executable = True, + cfg = "target", ), }, test = True, diff --git a/test/runtimes/exclude_go1.12.csv b/test/runtimes/exclude_go1.12.csv index 8c8ae0c5d..81e02cf64 100644 --- a/test/runtimes/exclude_go1.12.csv +++ b/test/runtimes/exclude_go1.12.csv @@ -2,15 +2,12 @@ test name,bug id,comment cgo_errors,,FLAKY cgo_test,,FLAKY go_test:cmd/go,,FLAKY -go_test:cmd/vendor/golang.org/x/sys/unix,b/118783622,/dev devices missing -go_test:net,b/118784196,socket: invalid argument. Works as intended: see bug. +go_test:net,b/162473575,setsockopt: protocol not available. go_test:os,b/118780122,we have a pollable filesystem but that's a surprise -go_test:os/signal,b/118780860,/dev/pts not properly supported -go_test:runtime,b/118782341,sigtrap not reported or caught or something -go_test:syscall,b/118781998,bad bytes -- bad mem addr -race,b/118782931,thread sanitizer. Works as intended: b/62219744. +go_test:os/signal,b/118780860,/dev/pts not properly supported. Also being tracked in b/29356795. +go_test:runtime,b/118782341,sigtrap not reported or caught or something. Also being tracked in b/33003106. +go_test:syscall,b/118781998,bad bytes -- bad mem addr; FcntlFlock(F_GETLK) not supported. runtime:cpu124,b/118778254,segmentation fault test:0_1,,FLAKY -testasan,, testcarchive,b/118782924,no sigpipe testshared,,FLAKY diff --git a/test/runtimes/exclude_java11.csv b/test/runtimes/exclude_java11.csv index c012e5a56..997a29cad 100644 --- a/test/runtimes/exclude_java11.csv +++ b/test/runtimes/exclude_java11.csv @@ -15,12 +15,19 @@ java/lang/Character/CheckScript.java,,Fails in Docker java/lang/Character/CheckUnicode.java,,Fails in Docker java/lang/Class/GetPackageBootLoaderChildLayer.java,, java/lang/ClassLoader/nativeLibrary/NativeLibraryTest.java,,Fails in Docker +java/lang/module/ModuleDescriptorTest.java,, java/lang/String/nativeEncoding/StringPlatformChars.java,, -java/net/DatagramSocket/ReuseAddressTest.java,, -java/net/DatagramSocket/SendDatagramToBadAddress.java,b/78473345, -java/net/Inet4Address/PingThis.java,, +java/net/CookieHandler/B6791927.java,,java.lang.RuntimeException: Expiration date shouldn't be 0 +java/net/ipv6tests/TcpTest.java,,java.net.ConnectException: Connection timed out (Connection timed out) +java/net/ipv6tests/UdpTest.java,,Times out +java/net/Inet6Address/B6558853.java,,Times out +java/net/InetAddress/CheckJNI.java,,java.net.ConnectException: Connection timed out (Connection timed out) java/net/InterfaceAddress/NetworkPrefixLength.java,b/78507103, +java/net/MulticastSocket/B6425815.java,,java.net.SocketException: Protocol not available (Error getting socket option) +java/net/MulticastSocket/B6427403.java,,java.net.SocketException: Protocol not available java/net/MulticastSocket/MulticastTTL.java,, +java/net/MulticastSocket/NetworkInterfaceEmptyGetInetAddressesTest.java,,java.net.SocketException: Protocol not available (Error getting socket option) +java/net/MulticastSocket/NoLoopbackPackets.java,,java.net.SocketException: Protocol not available java/net/MulticastSocket/Promiscuous.java,, java/net/MulticastSocket/SetLoopbackMode.java,, java/net/MulticastSocket/SetTTLAndGetTTL.java,, @@ -28,24 +35,18 @@ java/net/MulticastSocket/Test.java,, java/net/MulticastSocket/TestDefaults.java,, java/net/MulticastSocket/TimeToLive.java,, java/net/NetworkInterface/NetworkInterfaceStreamTest.java,, +java/net/Socket/LinkLocal.java,,java.net.SocketTimeoutException: Receive timed out java/net/Socket/SetSoLinger.java,b/78527327,SO_LINGER is not yet supported -java/net/Socket/TrafficClass.java,b/78527818,Not supported on gVisor java/net/Socket/UrgentDataTest.java,b/111515323, -java/net/Socket/setReuseAddress/Basic.java,b/78519214,SO_REUSEADDR enabled by default java/net/SocketOption/OptionsTest.java,,Fails in Docker -java/net/SocketOption/TcpKeepAliveTest.java,, java/net/SocketPermission/SocketPermissionTest.java,, java/net/URLConnection/6212146/TestDriver.java,,Fails in Docker java/net/httpclient/RequestBuilderTest.java,,Fails in Docker -java/net/httpclient/ShortResponseBody.java,, -java/net/httpclient/ShortResponseBodyWithRetry.java,, -java/nio/channels/AsyncCloseAndInterrupt.java,, -java/nio/channels/AsynchronousServerSocketChannel/Basic.java,, -java/nio/channels/AsynchronousSocketChannel/Basic.java,b/77921528,SO_KEEPALIVE is not settable java/nio/channels/DatagramChannel/BasicMulticastTests.java,, -java/nio/channels/DatagramChannel/SocketOptionTests.java,,Fails in Docker +java/nio/channels/DatagramChannel/SocketOptionTests.java,,java.net.SocketException: Invalid argument java/nio/channels/DatagramChannel/UseDGWithIPv6.java,, java/nio/channels/FileChannel/directio/DirectIOTest.java,,Fails in Docker +java/nio/channels/FileChannel/directio/PwriteDirect.java,,java.io.IOException: Invalid argument java/nio/channels/Selector/OutOfBand.java,, java/nio/channels/Selector/SelectWithConsumer.java,,Flaky java/nio/channels/ServerSocketChannel/SocketOptionTests.java,, @@ -59,15 +60,89 @@ java/text/Format/NumberFormat/CurrencyFormat.java,,Fails in Docker java/util/Calendar/JapaneseEraNameTest.java,, java/util/Currency/CurrencyTest.java,,Fails in Docker java/util/Currency/ValidateISO4217.java,,Fails in Docker +java/util/EnumSet/BogusEnumSet.java,,"java.io.InvalidClassException: java.util.EnumSet; local class incompatible: stream classdesc serialVersionUID = -2409567991088730183, local class serialVersionUID = 1009687484059888093" +java/util/Locale/Bug8040211.java,,java.lang.RuntimeException: Failed. java/util/Locale/LSRDataTest.java,, +java/util/Properties/CompatibilityTest.java,,"java.lang.RuntimeException: jdk.internal.org.xml.sax.SAXParseException; Internal DTD subset is not allowed. The Properties XML document must have the following DOCTYPE declaration: <!DOCTYPE properties SYSTEM ""http://java.sun.com/dtd/properties.dtd"">" +java/util/ResourceBundle/Control/XMLResourceBundleTest.java,,java.util.MissingResourceException: Can't find bundle for base name XmlRB locale +java/util/ResourceBundle/modules/xmlformat/xmlformat.sh,,Timeout reached: 60000. Process is not alive! +java/util/TimeZone/TimeZoneTest.java,,Uncaught exception thrown in test method TestShortZoneIDs java/util/concurrent/locks/Lock/TimedAcquireLeak.java,, java/util/jar/JarFile/mrjar/MultiReleaseJarAPI.java,,Fails in Docker java/util/logging/LogManager/Configuration/updateConfiguration/SimpleUpdateConfigWithInputStreamTest.java,, java/util/logging/TestLoggerWeakRefLeak.java,, +java/util/spi/ResourceBundleControlProvider/UserDefaultControlTest.java,,java.util.MissingResourceException: Can't find bundle for base name com.foo.XmlRB locale javax/imageio/AppletResourceTest.java,, +javax/imageio/plugins/jpeg/JPEGsNotAcceleratedTest.java,,java.awt.HeadlessException: No X11 DISPLAY variable was set but this program performed an operation which requires it. javax/management/security/HashedPasswordFileTest.java,, +javax/net/ssl/DTLS/DTLSBufferOverflowUnderflowTest.java,,Compilation failed +javax/net/ssl/DTLS/DTLSDataExchangeTest.java,,Compilation failed +javax/net/ssl/DTLS/DTLSEnginesClosureTest.java,,Compilation failed +javax/net/ssl/DTLS/DTLSHandshakeTest.java,,Compilation failed +javax/net/ssl/DTLS/DTLSHandshakeWithReplicatedPacketsTest.java,,Compilation failed +javax/net/ssl/DTLS/DTLSIncorrectAppDataTest.java,,Compilation failed +javax/net/ssl/DTLS/DTLSMFLNTest.java,,Compilation failed +javax/net/ssl/DTLS/DTLSNotEnabledRC4Test.java,,Compilation failed +javax/net/ssl/DTLS/DTLSRehandshakeTest.java,,Compilation failed +javax/net/ssl/DTLS/DTLSRehandshakeWithCipherChangeTest.java,,Compilation failed +javax/net/ssl/DTLS/DTLSRehandshakeWithDataExTest.java,,Compilation failed +javax/net/ssl/DTLS/DTLSSequenceNumberTest.java,,Compilation failed +javax/net/ssl/DTLS/DTLSUnsupportedCiphersTest.java,,Compilation failed +javax/net/ssl/DTLSv10/DTLSv10BufferOverflowUnderflowTest.java,,Compilation failed +javax/net/ssl/DTLSv10/DTLSv10DataExchangeTest.java,,Compilation failed +javax/net/ssl/DTLSv10/DTLSv10EnginesClosureTest.java,,Compilation failed +javax/net/ssl/DTLSv10/DTLSv10HandshakeTest.java,,Compilation failed +javax/net/ssl/DTLSv10/DTLSv10HandshakeWithReplicatedPacketsTest.java,,Compilation failed +javax/net/ssl/DTLSv10/DTLSv10IncorrectAppDataTest.java,,Compilation failed +javax/net/ssl/DTLSv10/DTLSv10MFLNTest.java,,Compilation failed +javax/net/ssl/DTLSv10/DTLSv10NotEnabledRC4Test.java,,Compilation failed +javax/net/ssl/DTLSv10/DTLSv10RehandshakeTest.java,,Compilation failed +javax/net/ssl/DTLSv10/DTLSv10RehandshakeWithCipherChangeTest.java,,Compilation failed +javax/net/ssl/DTLSv10/DTLSv10RehandshakeWithDataExTest.java,,Compilation failed +javax/net/ssl/DTLSv10/DTLSv10SequenceNumberTest.java,,Compilation failed +javax/net/ssl/DTLSv10/DTLSv10UnsupportedCiphersTest.java,,Compilation failed javax/net/ssl/SSLSession/JSSERenegotiate.java,,Fails in Docker +javax/net/ssl/TLS/TLSDataExchangeTest.java,,Compilation failed +javax/net/ssl/TLS/TLSEnginesClosureTest.java,,Compilation failed +javax/net/ssl/TLS/TLSHandshakeTest.java,,Compilation failed +javax/net/ssl/TLS/TLSMFLNTest.java,,Compilation failed +javax/net/ssl/TLS/TLSNotEnabledRC4Test.java,,Compilation failed +javax/net/ssl/TLS/TLSRehandshakeTest.java,,Compilation failed +javax/net/ssl/TLS/TLSRehandshakeWithCipherChangeTest.java,,Compilation failed +javax/net/ssl/TLS/TLSRehandshakeWithDataExTest.java,,Compilation failed +javax/net/ssl/TLS/TLSUnsupportedCiphersTest.java,,Compilation failed +javax/net/ssl/TLSv1/TLSDataExchangeTest.java,,Compilation failed +javax/net/ssl/TLSv1/TLSEnginesClosureTest.java,,Compilation failed +javax/net/ssl/TLSv1/TLSHandshakeTest.java,,Compilation failed +javax/net/ssl/TLSv1/TLSMFLNTest.java,,Compilation failed +javax/net/ssl/TLSv1/TLSNotEnabledRC4Test.java,,Compilation failed +javax/net/ssl/TLSv1/TLSRehandshakeTest.java,,Compilation failed +javax/net/ssl/TLSv1/TLSRehandshakeWithCipherChangeTest.java,,Compilation failed +javax/net/ssl/TLSv1/TLSRehandshakeWithDataExTest.java,,Compilation failed +javax/net/ssl/TLSv1/TLSUnsupportedCiphersTest.java,,Compilation failed +javax/net/ssl/TLSv11/TLSDataExchangeTest.java,,Compilation failed +javax/net/ssl/TLSv11/TLSEnginesClosureTest.java,,Compilation failed +javax/net/ssl/TLSv11/TLSHandshakeTest.java,,Compilation failed +javax/net/ssl/TLSv11/TLSMFLNTest.java,,Compilation failed +javax/net/ssl/TLSv11/TLSNotEnabledRC4Test.java,,Compilation failed +javax/net/ssl/TLSv11/TLSRehandshakeTest.java,,Compilation failed +javax/net/ssl/TLSv11/TLSRehandshakeWithCipherChangeTest.java,,Compilation failed +javax/net/ssl/TLSv11/TLSRehandshakeWithDataExTest.java,,Compilation failed +javax/net/ssl/TLSv11/TLSUnsupportedCiphersTest.java,,Compilation failed +javax/net/ssl/TLSv12/TLSEnginesClosureTest.java,,Compilation failed javax/sound/sampled/AudioInputStream/FrameLengthAfterConversion.java,, +jdk/jfr/cmd/TestHelp.java,,java.lang.RuntimeException: 'Available commands are:' missing from stdout/stderr +jdk/jfr/cmd/TestPrint.java,,Missing file' missing from stdout/stderr +jdk/jfr/cmd/TestPrintDefault.java,,java.lang.RuntimeException: 'JVMInformation' missing from stdout/stderr +jdk/jfr/cmd/TestPrintJSON.java,,javax.script.ScriptException: <eval>:1:17 Expected an operand but found eof var jsonObject = ^ in <eval> at line number 1 at column number 17 +jdk/jfr/cmd/TestPrintXML.java,,org.xml.sax.SAXParseException; lineNumber: 1; columnNumber: 1; Premature end of file. +jdk/jfr/cmd/TestReconstruct.java,,java.lang.RuntimeException: 'Too few arguments' missing from stdout/stderr +jdk/jfr/cmd/TestSplit.java,,java.lang.RuntimeException: 'Missing file' missing from stdout/stderr +jdk/jfr/cmd/TestSummary.java,,java.lang.RuntimeException: 'Missing file' missing from stdout/stderr +jdk/jfr/event/compiler/TestCompilerStats.java,,java.lang.RuntimeException: Field nmetodsSize not in event +jdk/jfr/event/metadata/TestDefaultConfigurations.java,,Setting 'threshold' in event 'jdk.SecurityPropertyModification' was not configured in the configuration 'default' +jdk/jfr/event/runtime/TestActiveSettingEvent.java,,java.lang.Exception: Could not find setting with name jdk.X509Validation#threshold +jdk/jfr/event/runtime/TestModuleEvents.java,,java.lang.RuntimeException: assertEquals: expected jdk.proxy1 to equal java.base jdk/jfr/event/runtime/TestNetworkUtilizationEvent.java,, jdk/jfr/event/runtime/TestThreadParkEvent.java,, jdk/jfr/event/sampling/TestNative.java,, @@ -86,6 +161,7 @@ jdk/jfr/jcmd/TestJcmdStartStopDefault.java,, jdk/jfr/jcmd/TestJcmdStartWithOptions.java,, jdk/jfr/jcmd/TestJcmdStartWithSettings.java,, jdk/jfr/jcmd/TestJcmdStopInvalidFile.java,, +jdk/jfr/jvm/TestGetAllEventClasses.java,,Compilation failed jdk/jfr/jvm/TestJfrJavaBase.java,, jdk/jfr/startupargs/TestStartRecording.java,, jdk/modules/incubator/ImageModules.java,, @@ -100,6 +176,10 @@ sun/management/jmxremote/bootstrap/RmiSslBootstrapTest.sh,, sun/management/jmxremote/startstop/JMXStartStopTest.java,, sun/management/jmxremote/startstop/JMXStatusPerfCountersTest.java,, sun/management/jmxremote/startstop/JMXStatusTest.java,, +sun/management/jdp/JdpDefaultsTest.java,, +sun/management/jdp/JdpJmxRemoteDynamicPortTest.java,, +sun/management/jdp/JdpOffTest.java,, +sun/management/jdp/JdpSpecificAddressTest.java,, sun/text/resources/LocaleDataTest.java,, sun/tools/jcmd/TestJcmdSanity.java,, sun/tools/jhsdb/AlternateHashingTest.java,, @@ -123,4 +203,6 @@ tools/jlink/JLinkTest.java,, tools/jlink/plugins/IncludeLocalesPluginTest.java,, tools/jmod/hashes/HashesTest.java,, tools/launcher/BigJar.java,b/111611473, +tools/launcher/HelpFlagsTest.java,,java.lang.AssertionError: HelpFlagsTest failed: Tool jfr not covered by this test. Add specification to jdkTools array! +tools/launcher/VersionCheck.java,,java.lang.AssertionError: VersionCheck failed: testToolVersion: [jfr]; tools/launcher/modules/patch/systemmodules/PatchSystemModules.java,, diff --git a/test/runtimes/exclude_nodejs12.4.0.csv b/test/runtimes/exclude_nodejs12.4.0.csv index 4eb0a4807..1d8e65fd0 100644 --- a/test/runtimes/exclude_nodejs12.4.0.csv +++ b/test/runtimes/exclude_nodejs12.4.0.csv @@ -1,51 +1,55 @@ test name,bug id,comment benchmark/test-benchmark-fs.js,, -benchmark/test-benchmark-module.js,, benchmark/test-benchmark-napi.js,, doctool/test-make-doc.js,b/68848110,Expected to fail. -fixtures/test-error-first-line-offset.js,, -fixtures/test-fs-readfile-error.js,, -fixtures/test-fs-stat-sync-overflow.js,, -internet/test-dgram-broadcast-multi-process.js,, -internet/test-dgram-multicast-multi-process.js,, -internet/test-dgram-multicast-set-interface-lo.js,, +internet/test-dgram-multicast-set-interface-lo.js,b/162798882, internet/test-doctool-versions.js,, internet/test-uv-threadpool-schedule.js,, parallel/test-cluster-dgram-reuse.js,b/64024294, parallel/test-dgram-bind-fd.js,b/132447356, -parallel/test-dgram-create-socket-handle-fd.js,b/132447238, -parallel/test-dgram-createSocket-type.js,b/68847739, parallel/test-dgram-socket-buffer-size.js,b/68847921, +parallel/test-dns-channel-timeout.js,b/161893056, parallel/test-fs-access.js,, -parallel/test-fs-write-stream-double-close.js,, +parallel/test-fs-watchfile.js,,Flaky - File already exists error +parallel/test-fs-write-stream.js,,Flaky parallel/test-fs-write-stream-throw-type-error.js,b/110226209, -parallel/test-fs-write-stream.js,, -parallel/test-http2-respond-file-error-pipe-offset.js,, -parallel/test-os.js,, +parallel/test-http-writable-true-after-close.js,,Flaky - Mismatched <anonymous> function calls. Expected exactly 1 actual 2 +parallel/test-os.js,b/63997097, +parallel/test-net-server-listen-options.js,,Flaky - EADDRINUSE parallel/test-process-uid-gid.js,, -pseudo-tty/test-assert-colors.js,, -pseudo-tty/test-assert-no-color.js,, -pseudo-tty/test-assert-position-indicator.js,, -pseudo-tty/test-async-wrap-getasyncid-tty.js,, -pseudo-tty/test-fatal-error.js,, -pseudo-tty/test-handle-wrap-isrefed-tty.js,, -pseudo-tty/test-readable-tty-keepalive.js,, -pseudo-tty/test-set-raw-mode-reset-process-exit.js,, -pseudo-tty/test-set-raw-mode-reset-signal.js,, -pseudo-tty/test-set-raw-mode-reset.js,, -pseudo-tty/test-stderr-stdout-handle-sigwinch.js,, -pseudo-tty/test-stdout-read.js,, -pseudo-tty/test-tty-color-support.js,, -pseudo-tty/test-tty-isatty.js,, -pseudo-tty/test-tty-stdin-call-end.js,, -pseudo-tty/test-tty-stdin-end.js,, -pseudo-tty/test-stdin-write.js,, -pseudo-tty/test-tty-stdout-end.js,, -pseudo-tty/test-tty-stdout-resize.js,, -pseudo-tty/test-tty-stream-constructors.js,, -pseudo-tty/test-tty-window-size.js,, -pseudo-tty/test-tty-wrap.js,, +parallel/test-tls-cli-min-version-1.0.js,,Flaky - EADDRINUSE +parallel/test-tls-cli-min-version-1.1.js,,Flaky - EADDRINUSE +parallel/test-tls-cli-min-version-1.2.js,,Flaky - EADDRINUSE +parallel/test-tls-cli-min-version-1.3.js,,Flaky - EADDRINUSE +parallel/test-tls-cli-max-version-1.2.js,,Flaky - EADDRINUSE +parallel/test-tls-cli-max-version-1.3.js,,Flaky - EADDRINUSE +parallel/test-tls-min-max-version.js,,Flaky - EADDRINUSE +pseudo-tty/test-assert-colors.js,b/162801321, +pseudo-tty/test-assert-no-color.js,b/162801321, +pseudo-tty/test-assert-position-indicator.js,b/162801321, +pseudo-tty/test-async-wrap-getasyncid-tty.js,b/162801321, +pseudo-tty/test-fatal-error.js,b/162801321, +pseudo-tty/test-handle-wrap-isrefed-tty.js,b/162801321, +pseudo-tty/test-readable-tty-keepalive.js,b/162801321, +pseudo-tty/test-set-raw-mode-reset-process-exit.js,b/162801321, +pseudo-tty/test-set-raw-mode-reset-signal.js,b/162801321, +pseudo-tty/test-set-raw-mode-reset.js,b/162801321, +pseudo-tty/test-stderr-stdout-handle-sigwinch.js,b/162801321, +pseudo-tty/test-stdout-read.js,b/162801321, +pseudo-tty/test-tty-color-support.js,b/162801321, +pseudo-tty/test-tty-isatty.js,b/162801321, +pseudo-tty/test-tty-stdin-call-end.js,b/162801321, +pseudo-tty/test-tty-stdin-end.js,b/162801321, +pseudo-tty/test-stdin-write.js,b/162801321, +pseudo-tty/test-tty-stdout-end.js,b/162801321, +pseudo-tty/test-tty-stdout-resize.js,b/162801321, +pseudo-tty/test-tty-stream-constructors.js,b/162801321, +pseudo-tty/test-tty-window-size.js,b/162801321, +pseudo-tty/test-tty-wrap.js,b/162801321, +pummel/test-heapdump-http2.js,,Flaky pummel/test-net-pingpong.js,, -pummel/test-vm-memleak.js,, +pummel/test-vm-memleak.js,b/162799436, +sequential/test-child-process-pass-fd.js,b/63926391,Flaky +sequential/test-https-connect-localport.js,,Flaky - EADDRINUSE sequential/test-net-bytes-per-incoming-chunk-overhead.js,,flaky - timeout tick-processor/test-tick-processor-builtin.js,, diff --git a/test/runtimes/exclude_php7.3.6.csv b/test/runtimes/exclude_php7.3.6.csv index 0bef786c0..2ce979dc8 100644 --- a/test/runtimes/exclude_php7.3.6.csv +++ b/test/runtimes/exclude_php7.3.6.csv @@ -8,23 +8,24 @@ ext/mbstring/tests/bug77165.phpt,, ext/mbstring/tests/bug77454.phpt,, ext/mbstring/tests/mb_convert_encoding_leak.phpt,, ext/mbstring/tests/mb_strrpos_encoding_3rd_param.phpt,, +ext/session/tests/session_module_name_variation4.phpt,,Flaky ext/session/tests/session_set_save_handler_class_018.phpt,, ext/session/tests/session_set_save_handler_iface_003.phpt,, +ext/session/tests/session_set_save_handler_sid_001.phpt,, ext/session/tests/session_set_save_handler_variation4.phpt,, -ext/session/tests/session_set_save_handler_variation5.phpt,, -ext/standard/tests/file/filetype_variation.phpt,, -ext/standard/tests/file/fopen_variation19.phpt,, +ext/standard/tests/file/fopen_variation19.phpt,b/162894964, +ext/standard/tests/file/lstat_stat_variation14.phpt,,Flaky ext/standard/tests/file/php_fd_wrapper_01.phpt,, ext/standard/tests/file/php_fd_wrapper_02.phpt,, ext/standard/tests/file/php_fd_wrapper_03.phpt,, ext/standard/tests/file/php_fd_wrapper_04.phpt,, -ext/standard/tests/file/realpath_bug77484.phpt,, +ext/standard/tests/file/realpath_bug77484.phpt,b/162894969, ext/standard/tests/file/rename_variation.phpt,b/68717309, -ext/standard/tests/file/symlink_link_linkinfo_is_link_variation4.phpt,, -ext/standard/tests/file/symlink_link_linkinfo_is_link_variation8.phpt,, +ext/standard/tests/file/symlink_link_linkinfo_is_link_variation4.phpt,b/162895341, +ext/standard/tests/file/symlink_link_linkinfo_is_link_variation8.phpt,b/162896223, ext/standard/tests/general_functions/escapeshellarg_bug71270.phpt,, ext/standard/tests/general_functions/escapeshellcmd_bug71270.phpt,, -ext/standard/tests/network/bug20134.phpt,, +ext/standard/tests/streams/proc_open_bug69900.phpt,,Flaky ext/standard/tests/streams/stream_socket_sendto.phpt,, ext/standard/tests/strings/007.phpt,, sapi/cli/tests/upload_2G.phpt,, @@ -34,4 +35,4 @@ tests/output/stream_isatty_in-out-err.phpt,, tests/output/stream_isatty_in-out.phpt,b/68720299, tests/output/stream_isatty_out-err.phpt,b/68720311, tests/output/stream_isatty_out.phpt,b/68720325, -Zend/tests/concat_003.phpt,, +Zend/tests/concat_003.phpt,b/162896021, diff --git a/test/runtimes/exclude_python3.7.3.csv b/test/runtimes/exclude_python3.7.3.csv index 2b9947212..8760f8951 100644 --- a/test/runtimes/exclude_python3.7.3.csv +++ b/test/runtimes/exclude_python3.7.3.csv @@ -1,27 +1,21 @@ test name,bug id,comment -test_asynchat,b/76031995,SO_REUSEADDR test_asyncio,,Fails on Docker. -test_asyncore,b/76031995,SO_REUSEADDR -test_epoll,, -test_fcntl,,fcntl invalid argument -- artificial test to make sure something works in 64 bit mode. -test_ftplib,,Fails in Docker -test_httplib,b/76031995,SO_REUSEADDR -test_imaplib,, -test_logging,, +test_asyncore,b/162973328, +test_epoll,b/162983393, +test_fcntl,b/162978767,fcntl invalid argument -- artificial test to make sure something works in 64 bit mode. +test_httplib,b/163000009,OSError: [Errno 98] Address already in use +test_imaplib,b/162979661, +test_logging,b/162980079, test_multiprocessing_fork,,Flaky. Sometimes times out. test_multiprocessing_forkserver,,Flaky. Sometimes times out. test_multiprocessing_main_handling,,Flaky. Sometimes times out. test_multiprocessing_spawn,,Flaky. Sometimes times out. -test_nntplib,b/76031995,tests should not set SO_REUSEADDR -test_poplib,,Fails on Docker test_posix,b/76174079,posix.sched_get_priority_min not implemented + posix.sched_rr_get_interval not permitted -test_pty,b/76157709,out of pty devices -test_readline,b/76157709,out of pty devices +test_pty,b/162979921, +test_readline,b/162980389,TestReadline hangs forever test_resource,b/76174079, test_selectors,b/76116849,OSError not raised with epoll -test_smtplib,b/76031995,SO_REUSEADDR and unclosed sockets +test_smtplib,b/162980434,unclosed sockets +test_signal,,Flaky - signal: alarm clock test_socket,b/75983380, -test_ssl,b/76031995,SO_REUSEADDR -test_subprocess,, -test_support,b/76031995,SO_REUSEADDR -test_telnetlib,b/76031995,SO_REUSEADDR +test_subprocess,b/162980831, diff --git a/test/runtimes/proctor/go.go b/test/runtimes/proctor/go.go index 073c2959d..d0ae844e6 100644 --- a/test/runtimes/proctor/go.go +++ b/test/runtimes/proctor/go.go @@ -81,13 +81,13 @@ func (goRunner) TestCmds(tests []string) []*exec.Cmd { if strings.HasSuffix(test, ".go") { onDiskTests = append(onDiskTests, test) } else { - toolTests = append(toolTests, test) + toolTests = append(toolTests, "^"+test+"$") } } var cmds []*exec.Cmd if len(toolTests) > 0 { - cmds = append(cmds, exec.Command("go", "tool", "dist", "test", "-run", strings.Join(toolTests, "\\|"))) + cmds = append(cmds, exec.Command("go", "tool", "dist", "test", "-v", "-no-rebuild", "-run", strings.Join(toolTests, "\\|"))) } if len(onDiskTests) > 0 { cmd := exec.Command("go", append([]string{"run", "run.go", "-v", "--"}, onDiskTests...)...) diff --git a/test/runtimes/proctor/java.go b/test/runtimes/proctor/java.go index 737fbe23e..d456fa681 100644 --- a/test/runtimes/proctor/java.go +++ b/test/runtimes/proctor/java.go @@ -64,8 +64,11 @@ func (javaRunner) ListTests() ([]string, error) { func (javaRunner) TestCmds(tests []string) []*exec.Cmd { args := append( []string{ - "-noreport", - "-dir:" + javaTestDir, + "-agentvm", // Execute each action using a pool of reusable JVMs. + "-dir:" + javaTestDir, // Base directory for test files and directories. + "-noreport", // Do not generate a final report. + "-timeoutFactor:20", // Extend the default timeout (2 min) of all tests by this factor. + "-verbose:nopass", // Verbose output but supress it for tests that passed. }, tests..., ) diff --git a/test/runtimes/proctor/nodejs.go b/test/runtimes/proctor/nodejs.go index 23d6edc72..dead5af4f 100644 --- a/test/runtimes/proctor/nodejs.go +++ b/test/runtimes/proctor/nodejs.go @@ -41,6 +41,6 @@ func (nodejsRunner) ListTests() ([]string, error) { // TestCmds implements TestRunner.TestCmds. func (nodejsRunner) TestCmds(tests []string) []*exec.Cmd { - args := append([]string{filepath.Join("tools", "test.py")}, tests...) + args := append([]string{filepath.Join("tools", "test.py"), "--timeout=180"}, tests...) return []*exec.Cmd{exec.Command("/usr/bin/python", args...)} } diff --git a/test/runtimes/runner/main.go b/test/runtimes/runner/main.go index e230912c9..948e7cf9c 100644 --- a/test/runtimes/runner/main.go +++ b/test/runtimes/runner/main.go @@ -40,7 +40,7 @@ var ( ) // Wait time for each test to run. -const timeout = 45 * time.Minute +const timeout = 90 * time.Minute func main() { flag.Parse() diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index c19b30b4a..a31612b41 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -1023,6 +1023,7 @@ syscall_test( syscall_test( add_overlay = True, test = "//test/syscalls/linux:truncate_test", + vfs2 = "True", ) syscall_test( diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc index 5467fa2c8..34016d4bd 100644 --- a/test/syscalls/linux/fcntl.cc +++ b/test/syscalls/linux/fcntl.cc @@ -1004,7 +1004,8 @@ TEST(FcntlTest, SetOwnPid) { pid_t pid; EXPECT_THAT(pid = getpid(), SyscallSucceeds()); - ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, pid), SyscallSucceeds()); + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, pid), + SyscallSucceedsWithValue(0)); EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), SyscallSucceedsWithValue(pid)); @@ -1018,7 +1019,8 @@ TEST(FcntlTest, SetOwnPgrp) { pid_t pgid; EXPECT_THAT(pgid = getpgrp(), SyscallSucceeds()); - ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, -pgid), SyscallSucceeds()); + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, -pgid), + SyscallSucceedsWithValue(0)); // Verify with F_GETOWN_EX; using F_GETOWN on Linux may incorrectly treat the // negative return value as an error, converting the return value to -1 and @@ -1038,8 +1040,10 @@ TEST(FcntlTest, SetOwnUnset) { // Set and unset pid. pid_t pid; EXPECT_THAT(pid = getpid(), SyscallSucceeds()); - ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, pid), SyscallSucceeds()); - ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, 0), SyscallSucceeds()); + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, pid), + SyscallSucceedsWithValue(0)); + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, 0), + SyscallSucceedsWithValue(0)); EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), SyscallSucceedsWithValue(0)); @@ -1047,8 +1051,10 @@ TEST(FcntlTest, SetOwnUnset) { // Set and unset pgid. pid_t pgid; EXPECT_THAT(pgid = getpgrp(), SyscallSucceeds()); - ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, -pgid), SyscallSucceeds()); - ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, 0), SyscallSucceeds()); + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, -pgid), + SyscallSucceedsWithValue(0)); + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, 0), + SyscallSucceedsWithValue(0)); EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), SyscallSucceedsWithValue(0)); @@ -1120,7 +1126,7 @@ TEST(FcntlTest, SetOwnExTid) { EXPECT_THAT(owner.pid = syscall(__NR_gettid), SyscallSucceeds()); ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), - SyscallSucceeds()); + SyscallSucceedsWithValue(0)); EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), SyscallSucceedsWithValue(owner.pid)); @@ -1136,7 +1142,7 @@ TEST(FcntlTest, SetOwnExPid) { EXPECT_THAT(owner.pid = getpid(), SyscallSucceeds()); ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), - SyscallSucceeds()); + SyscallSucceedsWithValue(0)); EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), SyscallSucceedsWithValue(owner.pid)); @@ -1152,7 +1158,7 @@ TEST(FcntlTest, SetOwnExPgrp) { EXPECT_THAT(set_owner.pid = getpgrp(), SyscallSucceeds()); ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner), - SyscallSucceeds()); + SyscallSucceedsWithValue(0)); // Verify with F_GETOWN_EX; using F_GETOWN on Linux may incorrectly treat the // negative return value as an error, converting the return value to -1 and @@ -1176,10 +1182,10 @@ TEST(FcntlTest, SetOwnExUnset) { owner.type = F_OWNER_PID; EXPECT_THAT(owner.pid = getpid(), SyscallSucceeds()); ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), - SyscallSucceeds()); + SyscallSucceedsWithValue(0)); owner.pid = 0; ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), - SyscallSucceeds()); + SyscallSucceedsWithValue(0)); EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), SyscallSucceedsWithValue(0)); @@ -1188,10 +1194,10 @@ TEST(FcntlTest, SetOwnExUnset) { owner.type = F_OWNER_PGRP; EXPECT_THAT(owner.pid = getpgrp(), SyscallSucceeds()); ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), - SyscallSucceeds()); + SyscallSucceedsWithValue(0)); owner.pid = 0; ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), - SyscallSucceeds()); + SyscallSucceedsWithValue(0)); EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), SyscallSucceedsWithValue(0)); @@ -1207,7 +1213,7 @@ TEST(FcntlTest, GetOwnExTid) { EXPECT_THAT(set_owner.pid = syscall(__NR_gettid), SyscallSucceeds()); ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner), - SyscallSucceeds()); + SyscallSucceedsWithValue(0)); f_owner_ex got_owner = {}; ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner), @@ -1225,7 +1231,7 @@ TEST(FcntlTest, GetOwnExPid) { EXPECT_THAT(set_owner.pid = getpid(), SyscallSucceeds()); ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner), - SyscallSucceeds()); + SyscallSucceedsWithValue(0)); f_owner_ex got_owner = {}; ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner), @@ -1243,7 +1249,7 @@ TEST(FcntlTest, GetOwnExPgrp) { EXPECT_THAT(set_owner.pid = getpgrp(), SyscallSucceeds()); ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner), - SyscallSucceeds()); + SyscallSucceedsWithValue(0)); f_owner_ex got_owner = {}; ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner), diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc index 220874aeb..5cb325a9e 100644 --- a/test/syscalls/linux/inotify.cc +++ b/test/syscalls/linux/inotify.cc @@ -18,6 +18,7 @@ #include <sys/epoll.h> #include <sys/inotify.h> #include <sys/ioctl.h> +#include <sys/sendfile.h> #include <sys/time.h> #include <sys/xattr.h> @@ -1681,6 +1682,60 @@ TEST(Inotify, EpollNoDeadlock) { } } +TEST(Inotify, Fallocate) { + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR)); + + const FileDescriptor inotify_fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(inotify_fd.get(), file.path(), IN_ALL_EVENTS)); + + // Do an arbitrary modification with fallocate. + ASSERT_THAT(RetryEINTR(fallocate)(fd.get(), 0, 0, 123), SyscallSucceeds()); + std::vector<Event> events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + EXPECT_THAT(events, Are({Event(IN_MODIFY, wd)})); +} + +TEST(Inotify, Sendfile) { + SKIP_IF(IsRunningWithVFS1()); + + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateFileWith(root.path(), "x", 0644)); + const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const FileDescriptor in = + ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY)); + const FileDescriptor out = + ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY)); + + // Create separate inotify instances for the in and out fds. If both watches + // were on the same instance, we would have discrepancies between Linux and + // gVisor (order of events, duplicate events), which is not that important + // since inotify is asynchronous anyway. + const FileDescriptor in_inotify = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const FileDescriptor out_inotify = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const int in_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(in_inotify.get(), in_file.path(), IN_ALL_EVENTS)); + const int out_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(out_inotify.get(), out_file.path(), IN_ALL_EVENTS)); + + ASSERT_THAT(sendfile(out.get(), in.get(), /*offset=*/nullptr, 1), + SyscallSucceeds()); + + // Expect a single access event and a single modify event. + std::vector<Event> in_events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(in_inotify.get())); + std::vector<Event> out_events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(out_inotify.get())); + EXPECT_THAT(in_events, Are({Event(IN_ACCESS, in_wd)})); + EXPECT_THAT(out_events, Are({Event(IN_MODIFY, out_wd)})); +} + // On Linux, inotify behavior is not very consistent with splice(2). We try our // best to emulate Linux for very basic calls to splice. TEST(Inotify, SpliceOnWatchTarget) { diff --git a/test/syscalls/linux/mount.cc b/test/syscalls/linux/mount.cc index 97e8d0f7e..46b6f38db 100644 --- a/test/syscalls/linux/mount.cc +++ b/test/syscalls/linux/mount.cc @@ -326,6 +326,14 @@ TEST(MountTest, MountFuseFilesystemNoDevice) { SKIP_IF(IsRunningOnGvisor() && !IsFUSEEnabled()); auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + // Before kernel version 4.16-rc6, FUSE mount is protected by + // capable(CAP_SYS_ADMIN). After this version, it uses + // ns_capable(CAP_SYS_ADMIN) to protect. Before the 4.16 kernel, it was not + // allowed to mount fuse file systems without the global CAP_SYS_ADMIN. + int res = mount("", dir.path().c_str(), "fuse", 0, ""); + SKIP_IF(!IsRunningOnGvisor() && res == -1 && errno == EPERM); + EXPECT_THAT(mount("", dir.path().c_str(), "fuse", 0, ""), SyscallFailsWithErrno(EINVAL)); } @@ -339,6 +347,12 @@ TEST(MountTest, MountFuseFilesystem) { std::string mopts = "fd=" + std::to_string(fd.get()); auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + // See comments in MountFuseFilesystemNoDevice for the reason why we skip + // EPERM when running on Linux. + int res = mount("", dir.path().c_str(), "fuse", 0, ""); + SKIP_IF(!IsRunningOnGvisor() && res == -1 && errno == EPERM); + auto const mount = ASSERT_NO_ERRNO_AND_VALUE(Mount("", dir.path(), "fuse", 0, mopts, 0)); } diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc index 3377b65cf..4fab097f4 100644 --- a/test/syscalls/linux/proc_net.cc +++ b/test/syscalls/linux/proc_net.cc @@ -477,6 +477,44 @@ TEST(ProcNetSnmp, CheckSnmp) { EXPECT_EQ(value_count, 1); } +TEST(ProcSysNetIpv4Recovery, Exists) { + EXPECT_THAT(open("/proc/sys/net/ipv4/tcp_recovery", O_RDONLY), + SyscallSucceeds()); +} + +TEST(ProcSysNetIpv4Recovery, CanReadAndWrite) { + // TODO(b/162988252): Enable save/restore for this test after the bug is + // fixed. + DisableSave ds; + + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability((CAP_DAC_OVERRIDE)))); + + auto const fd = ASSERT_NO_ERRNO_AND_VALUE( + Open("/proc/sys/net/ipv4/tcp_recovery", O_RDWR)); + + char buf[10] = {'\0'}; + char to_write = '2'; + + // Check initial value is set to 1. + EXPECT_THAT(PreadFd(fd.get(), &buf, sizeof(buf), 0), + SyscallSucceedsWithValue(sizeof(to_write) + 1)); + EXPECT_EQ(strcmp(buf, "1\n"), 0); + + // Set tcp_recovery to one of the allowed constants. + EXPECT_THAT(PwriteFd(fd.get(), &to_write, sizeof(to_write), 0), + SyscallSucceedsWithValue(sizeof(to_write))); + EXPECT_THAT(PreadFd(fd.get(), &buf, sizeof(buf), 0), + SyscallSucceedsWithValue(sizeof(to_write) + 1)); + EXPECT_EQ(strcmp(buf, "2\n"), 0); + + // Set tcp_recovery to any random value. + char kMessage[] = "100"; + EXPECT_THAT(PwriteFd(fd.get(), kMessage, strlen(kMessage), 0), + SyscallSucceedsWithValue(strlen(kMessage))); + EXPECT_THAT(PreadFd(fd.get(), buf, sizeof(kMessage), 0), + SyscallSucceedsWithValue(sizeof(kMessage))); + EXPECT_EQ(strcmp(buf, "100\n"), 0); +} } // namespace } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/raw_socket_hdrincl.cc b/test/syscalls/linux/raw_socket_hdrincl.cc index 97f0467aa..2f25aceb2 100644 --- a/test/syscalls/linux/raw_socket_hdrincl.cc +++ b/test/syscalls/linux/raw_socket_hdrincl.cc @@ -178,7 +178,7 @@ TEST_F(RawHDRINCL, ConnectToLoopback) { } TEST_F(RawHDRINCL, SendWithoutConnectSucceeds) { - // FIXME(github.dev/issue/3159): Test currently flaky. + // FIXME(gvisor.dev/issue/3159): Test currently flaky. SKIP_IF(true); struct iphdr hdr = LoopbackHeader(); @@ -284,7 +284,7 @@ TEST_F(RawHDRINCL, SendAndReceive) { // Send and receive a packet where the sendto address is not the same as the // provided destination. TEST_F(RawHDRINCL, SendAndReceiveDifferentAddress) { - // FIXME(github.dev/issue/3160): Test currently flaky. + // FIXME(gvisor.dev/issue/3160): Test currently flaky. SKIP_IF(true); int port = 40000; diff --git a/test/syscalls/linux/socket_inet_loopback_nogotsan.cc b/test/syscalls/linux/socket_inet_loopback_nogotsan.cc index 2324c7f6a..791e2bd51 100644 --- a/test/syscalls/linux/socket_inet_loopback_nogotsan.cc +++ b/test/syscalls/linux/socket_inet_loopback_nogotsan.cc @@ -82,8 +82,11 @@ using SocketInetLoopbackTest = ::testing::TestWithParam<TestParam>; // This test verifies that connect returns EADDRNOTAVAIL if all local ephemeral // ports are already in use for a given destination ip/port. +// // We disable S/R because this test creates a large number of sockets. -TEST_P(SocketInetLoopbackTest, TestTCPPortExhaustion_NoRandomSave) { +// +// FIXME(b/162475855): This test is failing reliably. +TEST_P(SocketInetLoopbackTest, DISABLED_TestTCPPortExhaustion_NoRandomSave) { auto const& param = GetParam(); TestAddress const& listener = param.listener; TestAddress const& connector = param.connector; diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc index c2ecb639f..53c076787 100644 --- a/test/syscalls/linux/socket_ip_tcp_generic.cc +++ b/test/syscalls/linux/socket_ip_tcp_generic.cc @@ -34,6 +34,9 @@ namespace gvisor { namespace testing { +using ::testing::AnyOf; +using ::testing::Eq; + TEST_P(TCPSocketPairTest, TcpInfoSucceeds) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -800,6 +803,9 @@ TEST_P(TCPSocketPairTest, SetCongestionControlFailsForUnsupported) { // Linux and Netstack both default to a 60s TCP_LINGER2 timeout. constexpr int kDefaultTCPLingerTimeout = 60; +// On Linux, the maximum linger2 timeout was changed from 60sec to 120sec. +constexpr int kMaxTCPLingerTimeout = 120; +constexpr int kOldMaxTCPLingerTimeout = 60; TEST_P(TCPSocketPairTest, TCPLingerTimeoutDefault) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -827,12 +833,12 @@ TEST_P(TCPSocketPairTest, SetTCPLingerTimeoutZeroOrLess) { SyscallSucceedsWithValue(0)); } -TEST_P(TCPSocketPairTest, SetTCPLingerTimeoutAboveDefault) { +TEST_P(TCPSocketPairTest, SetTCPLingerTimeoutAboveMax) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); // Values above the net.ipv4.tcp_fin_timeout are capped to tcp_fin_timeout // on linux (defaults to 60 seconds on linux). - constexpr int kAboveDefault = kDefaultTCPLingerTimeout + 1; + constexpr int kAboveDefault = kMaxTCPLingerTimeout + 1; EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2, &kAboveDefault, sizeof(kAboveDefault)), SyscallSucceedsWithValue(0)); @@ -843,7 +849,12 @@ TEST_P(TCPSocketPairTest, SetTCPLingerTimeoutAboveDefault) { getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2, &get, &get_len), SyscallSucceedsWithValue(0)); EXPECT_EQ(get_len, sizeof(get)); - EXPECT_EQ(get, kDefaultTCPLingerTimeout); + if (IsRunningOnGvisor()) { + EXPECT_EQ(get, kMaxTCPLingerTimeout); + } else { + EXPECT_THAT(get, + AnyOf(Eq(kMaxTCPLingerTimeout), Eq(kOldMaxTCPLingerTimeout))); + } } TEST_P(TCPSocketPairTest, SetTCPLingerTimeout) { diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc index e6647a1c3..b3fcf8e7c 100644 --- a/test/syscalls/linux/socket_netlink_route.cc +++ b/test/syscalls/linux/socket_netlink_route.cc @@ -577,7 +577,10 @@ TEST(NetlinkRouteTest, GetRouteDump) { std::cout << std::endl; - if (msg->rtm_table == RT_TABLE_MAIN) { + // If the test is running in a new network namespace, it will have only + // the local route table. + if (msg->rtm_table == RT_TABLE_MAIN || + (!IsRunningOnGvisor() && msg->rtm_table == RT_TABLE_LOCAL)) { routeFound = true; dstFound = rtDstFound && dstFound; } diff --git a/tools/BUILD b/tools/BUILD index 34b950644..da83877b1 100644 --- a/tools/BUILD +++ b/tools/BUILD @@ -1 +1,9 @@ +load("//tools:defs.bzl", "bzl_library") + package(licenses = ["notice"]) + +bzl_library( + name = "defs_bzl", + srcs = ["defs.bzl"], + visibility = ["//visibility:private"], +) diff --git a/tools/bazel.mk b/tools/bazel.mk index 88faa0190..3e27af7d1 100644 --- a/tools/bazel.mk +++ b/tools/bazel.mk @@ -33,8 +33,8 @@ GCLOUD_CONFIG := $(shell readlink -m ~/.config/gcloud/) DOCKER_SOCKET := /var/run/docker.sock # Bazel flags. -OPTIONS += --test_output=errors --keep_going --verbose_failures=true BAZEL := bazel $(STARTUP_OPTIONS) +OPTIONS += --color=no --curses=no # Basic options. UID := $(shell id -u ${USER}) @@ -48,7 +48,10 @@ FULL_DOCKER_RUN_OPTIONS += -v "$(BAZEL_CACHE):$(BAZEL_CACHE)" FULL_DOCKER_RUN_OPTIONS += -v "$(GCLOUD_CONFIG):$(GCLOUD_CONFIG)" FULL_DOCKER_RUN_OPTIONS += -v "/tmp:/tmp" FULL_DOCKER_EXEC_OPTIONS := --user $(UID):$(GID) -FULL_DOCKER_EXEC_OPTIONS += -i +FULL_DOCKER_EXEC_OPTIONS += --interactive +ifeq (true,$(shell [[ -t 0 ]] && echo true)) +FULL_DOCKER_EXEC_OPTIONS += --tty +endif # Add docker passthrough options. ifneq ($(DOCKER_PRIVILEGED),) @@ -140,12 +143,13 @@ bazel-server: ## Ensures that the server exists. Used as an internal target. @docker exec $(FULL_DOCKER_EXEC_OPTIONS) $(DOCKER_NAME) true || $(MAKE) bazel-server-start .PHONY: bazel-server -build_cmd = docker exec $(FULL_DOCKER_EXEC_OPTIONS) $(DOCKER_NAME) sh -o pipefail -c '$(BAZEL) build $(OPTIONS) $(TARGETS)' +build_cmd = docker exec $(FULL_DOCKER_EXEC_OPTIONS) $(DOCKER_NAME) sh -o pipefail -c '$(BAZEL) build $(OPTIONS) "$(TARGETS)"' build_paths = $(build_cmd) 2>&1 \ | tee /proc/self/fd/2 \ | grep -E "^ bazel-bin/" \ - | awk "{print $$1;}" \ + | tr -d '\r' \ + | awk '{$$1=$$1};1' \ | xargs -n 1 -I {} sh -c "$(1)" build: bazel-server @@ -166,10 +170,12 @@ sudo: bazel-server @$(call build_paths,sudo -E {} $(ARGS)) .PHONY: sudo +test: OPTIONS += --test_output=errors --keep_going --verbose_failures=true test: bazel-server @docker exec $(FULL_DOCKER_EXEC_OPTIONS) $(DOCKER_NAME) $(BAZEL) test $(OPTIONS) $(TARGETS) .PHONY: test -query: bazel-server - @docker exec $(FULL_DOCKER_EXEC_OPTIONS) $(DOCKER_NAME) $(BAZEL) query $(OPTIONS) '$(TARGETS)' +query: + @$(MAKE) bazel-server >&2 # If we need to start, ensure stdout is not polluted. + @docker exec $(FULL_DOCKER_EXEC_OPTIONS) $(DOCKER_NAME) sh -o pipefail -c '$(BAZEL) query $(OPTIONS) "$(TARGETS)" 2>/dev/null' .PHONY: query diff --git a/tools/bazeldefs/BUILD b/tools/bazeldefs/BUILD index 3f809065d..8d4356119 100644 --- a/tools/bazeldefs/BUILD +++ b/tools/bazeldefs/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "rbe_platform", "rbe_toolchain") +load("//tools:defs.bzl", "bzl_library", "rbe_platform", "rbe_toolchain") package(licenses = ["notice"]) @@ -86,3 +86,21 @@ rbe_toolchain( toolchain = "@bazel_toolchains_bazel3//configs/ubuntu16_04_clang/11.0.0/bazel_3.1.0/cc:cc-compiler-k8", toolchain_type = "@bazel_tools//tools/cpp:toolchain_type", ) + +bzl_library( + name = "platforms_bzl", + srcs = ["platforms.bzl"], + visibility = ["//visibility:private"], +) + +bzl_library( + name = "tags_bzl", + srcs = ["tags.bzl"], + visibility = ["//visibility:private"], +) + +bzl_library( + name = "defs_bzl", + srcs = ["defs.bzl"], + visibility = ["//visibility:private"], +) diff --git a/tools/bazeldefs/defs.bzl b/tools/bazeldefs/defs.bzl index 3db8e13d0..db7f379b8 100644 --- a/tools/bazeldefs/defs.bzl +++ b/tools/bazeldefs/defs.bzl @@ -2,15 +2,16 @@ load("@bazel_gazelle//:def.bzl", _gazelle = "gazelle") load("@bazel_skylib//rules:build_test.bzl", _build_test = "build_test") +load("@bazel_skylib//:bzl_library.bzl", _bzl_library = "bzl_library") load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", _cc_flags_supplier = "cc_flags_supplier") load("@io_bazel_rules_go//go:def.bzl", "GoLibrary", _go_binary = "go_binary", _go_context = "go_context", _go_embed_data = "go_embed_data", _go_library = "go_library", _go_path = "go_path", _go_test = "go_test") load("@io_bazel_rules_go//proto:def.bzl", _go_grpc_library = "go_grpc_library", _go_proto_library = "go_proto_library") load("@rules_cc//cc:defs.bzl", _cc_binary = "cc_binary", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test") load("@rules_pkg//:pkg.bzl", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar") -load("@pydeps//:requirements.bzl", _py_requirement = "requirement") load("@com_github_grpc_grpc//bazel:cc_grpc_library.bzl", _cc_grpc_library = "cc_grpc_library") build_test = _build_test +bzl_library = _bzl_library cc_library = _cc_library cc_flags_supplier = _cc_flags_supplier cc_proto_library = _cc_proto_library @@ -25,9 +26,7 @@ gbenchmark = "@com_google_benchmark//:benchmark" loopback = "//tools/bazeldefs:loopback" pkg_deb = _pkg_deb pkg_tar = _pkg_tar -py_library = native.py_library py_binary = native.py_binary -py_test = native.py_test rbe_platform = native.platform rbe_toolchain = native.toolchain vdso_linker_option = "-fuse-ld=gold " @@ -163,9 +162,6 @@ def go_context(ctx): tags = go_ctx.tags, ) -def py_requirement(name, direct = True): - return _py_requirement(name) - def select_arch(amd64 = "amd64", arm64 = "arm64", default = None, **kwargs): values = { "@bazel_tools//src/conditions:linux_x86_64": amd64, diff --git a/tools/defs.bzl b/tools/defs.bzl index e35e29634..e71a26cf4 100644 --- a/tools/defs.bzl +++ b/tools/defs.bzl @@ -7,13 +7,14 @@ change for Google-internal and bazel-compatible rules. load("//tools/go_stateify:defs.bzl", "go_stateify") load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps", "marshal_test_deps") -load("//tools/bazeldefs:defs.bzl", _build_test = "build_test", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_grpc_library = "cc_grpc_library", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _default_installer = "default_installer", _default_net_util = "default_net_util", _gazelle = "gazelle", _gbenchmark = "gbenchmark", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_grpc_and_proto_libraries = "go_grpc_and_proto_libraries", _go_library = "go_library", _go_path = "go_path", _go_proto_library = "go_proto_library", _go_test = "go_test", _grpcpp = "grpcpp", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _rbe_platform = "rbe_platform", _rbe_toolchain = "rbe_toolchain", _select_arch = "select_arch", _select_system = "select_system", _short_path = "short_path", _vdso_linker_option = "vdso_linker_option") +load("//tools/bazeldefs:defs.bzl", _build_test = "build_test", _bzl_library = "bzl_library", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_grpc_library = "cc_grpc_library", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _default_installer = "default_installer", _default_net_util = "default_net_util", _gazelle = "gazelle", _gbenchmark = "gbenchmark", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_grpc_and_proto_libraries = "go_grpc_and_proto_libraries", _go_library = "go_library", _go_path = "go_path", _go_proto_library = "go_proto_library", _go_test = "go_test", _grpcpp = "grpcpp", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _rbe_platform = "rbe_platform", _rbe_toolchain = "rbe_toolchain", _select_arch = "select_arch", _select_system = "select_system", _short_path = "short_path", _vdso_linker_option = "vdso_linker_option") load("//tools/bazeldefs:platforms.bzl", _default_platform = "default_platform", _platforms = "platforms") load("//tools/bazeldefs:tags.bzl", "go_suffixes") load("//tools/nogo:defs.bzl", "nogo_test") # Delegate directly. build_test = _build_test +bzl_library = _bzl_library cc_binary = _cc_binary cc_flags_supplier = _cc_flags_supplier cc_grpc_library = _cc_grpc_library @@ -33,9 +34,6 @@ loopback = _loopback pkg_deb = _pkg_deb pkg_tar = _pkg_tar py_binary = _py_binary -py_library = _py_library -py_requirement = _py_requirement -py_test = _py_test select_arch = _select_arch select_system = _select_system short_path = _short_path diff --git a/tools/go_branch.sh b/tools/go_branch.sh index 093de89b4..e5c060024 100755 --- a/tools/go_branch.sh +++ b/tools/go_branch.sh @@ -40,10 +40,15 @@ trap finish EXIT # Record the current working commit. declare -r head=$(git describe --always) -# We expect to have an existing go branch that we will use as the basis for -# this commit. That branch may be empty, but it must exist. +# We expect to have an existing go branch that we will use as the basis for this +# commit. That branch may be empty, but it must exist. We search for this branch +# using the local branch, the "origin" branch, and other remotes, in order. git fetch --all -declare -r go_branch=$(git show-ref --hash go) +declare -r go_branch=$( \ + git show-ref --hash refs/heads/go || \ + git show-ref --hash refs/remotes/origin/go || \ + git show-ref --hash go | head -n 1 \ +) # Clone the current repository to the temporary directory, and check out the # current go_branch directory. We move to the new repository for convenience. @@ -66,6 +71,11 @@ git checkout -b go "${go_branch}" git merge --no-commit --strategy ours ${head} || \ git merge --allow-unrelated-histories --no-commit --strategy ours ${head} +# Normalize the permissions on the old branch. Note that they should be +# normalized if constructed by this tool, but we do so before the rsync. +find . -type f -exec chmod 0644 {} \; +find . -type d -exec chmod 0755 {} \; + # Sync the entire gopath_dir. rsync --recursive --verbose --delete --exclude .git -L "${gopath_dir}/" . @@ -86,7 +96,11 @@ EOF # There are a few solitary files that can get left behind due to the way bazel # constructs the gopath target. Note that we don't find all Go files here # because they may correspond to unused templates, etc. -cp "${repo_orig}"/runsc/*.go runsc/ +declare -ar binaries=( "runsc" "shim/v1" "shim/v2" ) +for target in "${binaries[@]}"; do + mkdir -p "${target}" + cp "${repo_orig}/${target}"/*.go "${target}/" +done # Normalize all permissions. The way bazel constructs the :gopath tree may leave # some strange permissions on files. We don't have anything in this tree that diff --git a/tools/go_generics/BUILD b/tools/go_generics/BUILD index 558826bf1..807c08ead 100644 --- a/tools/go_generics/BUILD +++ b/tools/go_generics/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_binary") +load("//tools:defs.bzl", "bzl_library", "go_binary") package(licenses = ["notice"]) @@ -12,3 +12,9 @@ go_binary( visibility = ["//:sandbox"], deps = ["//tools/go_generics/globals"], ) + +bzl_library( + name = "defs_bzl", + srcs = ["defs.bzl"], + visibility = ["//visibility:private"], +) diff --git a/tools/go_generics/tests/BUILD b/tools/go_generics/tests/BUILD index e69de29bb..7547a6b53 100644 --- a/tools/go_generics/tests/BUILD +++ b/tools/go_generics/tests/BUILD @@ -0,0 +1,7 @@ +load("//tools:defs.bzl", "bzl_library") + +bzl_library( + name = "defs_bzl", + srcs = ["defs.bzl"], + visibility = ["//visibility:private"], +) diff --git a/tools/go_marshal/BUILD b/tools/go_marshal/BUILD index be49cf9c8..f79defea7 100644 --- a/tools/go_marshal/BUILD +++ b/tools/go_marshal/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_binary") +load("//tools:defs.bzl", "bzl_library", "go_binary") licenses(["notice"]) @@ -17,3 +17,9 @@ config_setting( name = "marshal_config_verbose", values = {"define": "gomarshal=verbose"}, ) + +bzl_library( + name = "defs_bzl", + srcs = ["defs.bzl"], + visibility = ["//visibility:private"], +) diff --git a/tools/go_stateify/BUILD b/tools/go_stateify/BUILD index 503cdf2e5..913558b4e 100644 --- a/tools/go_stateify/BUILD +++ b/tools/go_stateify/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_binary") +load("//tools:defs.bzl", "bzl_library", "go_binary") package(licenses = ["notice"]) @@ -8,3 +8,9 @@ go_binary( visibility = ["//:sandbox"], deps = ["//tools/tags"], ) + +bzl_library( + name = "defs_bzl", + srcs = ["defs.bzl"], + visibility = ["//visibility:private"], +) diff --git a/tools/make_release.sh b/tools/make_release.sh index b1cdd47b0..9137dd9bb 100755 --- a/tools/make_release.sh +++ b/tools/make_release.sh @@ -43,8 +43,7 @@ install_raw() { # Copy the raw file & generate a sha512sum. name=$(basename "${binary}") cp -f "${binary}" "${root}/$1" - sha512sum "${root}/$1/${name}" | \ - awk "{print $$1 \" ${name}\"}" > "${root}/$1/${name}.sha512" + (cd "${root}/$1" && sha512sum "${name}" > "${name}.sha512") done } diff --git a/tools/nogo/BUILD b/tools/nogo/BUILD index c21b09511..e1bfb9a2c 100644 --- a/tools/nogo/BUILD +++ b/tools/nogo/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_library") +load("//tools:defs.bzl", "bzl_library", "go_library") package(licenses = ["notice"]) @@ -47,3 +47,9 @@ go_library( "@org_golang_x_tools//go/gcexportdata:go_tool_library", ], ) + +bzl_library( + name = "defs_bzl", + srcs = ["defs.bzl"], + visibility = ["//visibility:private"], +) diff --git a/tools/vm/BUILD b/tools/vm/BUILD index f7160c627..d95ca6c63 100644 --- a/tools/vm/BUILD +++ b/tools/vm/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "cc_binary", "gtest") +load("//tools:defs.bzl", "bzl_library", "cc_binary", "gtest") load("//tools/vm:defs.bzl", "vm_image", "vm_test") package( @@ -55,3 +55,9 @@ vm_test( shard_count = 2, targets = [":test"], ) + +bzl_library( + name = "defs_bzl", + srcs = ["defs.bzl"], + visibility = ["//visibility:private"], +) diff --git a/website/BUILD b/website/BUILD index 10e0299ae..7b61d13c8 100644 --- a/website/BUILD +++ b/website/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "pkg_tar") +load("//tools:defs.bzl", "bzl_library", "pkg_tar") load("//website:defs.bzl", "doc", "docs") package(licenses = ["notice"]) @@ -149,6 +149,9 @@ docs( "//g3doc/user_guide:install", "//g3doc/user_guide:networking", "//g3doc/user_guide:platforms", + "//g3doc/user_guide/containerd:configuration", + "//g3doc/user_guide/containerd:containerd_11", + "//g3doc/user_guide/containerd:quick_start", "//g3doc/user_guide/quick_start:docker", "//g3doc/user_guide/quick_start:kubernetes", "//g3doc/user_guide/quick_start:oci", @@ -177,3 +180,9 @@ genrule( "rm -rf $$T", tools = ["//website/cmd/syscalldocs"], ) + +bzl_library( + name = "defs_bzl", + srcs = ["defs.bzl"], + visibility = ["//visibility:private"], +) diff --git a/website/_layouts/docs.html b/website/_layouts/docs.html index 6bc5d87db..0422f9fb0 100644 --- a/website/_layouts/docs.html +++ b/website/_layouts/docs.html @@ -14,30 +14,25 @@ categories: {% for category in layout.categories %} <h3>{{ category }}</h3> <ul class="sidebar-nav"> - {% assign sorted_pages = site.pages | where: 'layout', 'docs' | where: 'category', category | sort: 'weight' | sort: 'subcategory' %} - {% assign subcategory = nil %} - {% for p in sorted_pages %} - {% if p.subcategory != subcategory %} - {% if subcategory != nil %} - </ul> - </li> - {% endif %} - {% assign subcategory = p.subcategory %} - {% if subcategory != nil %} - {% assign ac = "aria-controls" %} - {% assign cid = p.category | remove: " " | downcase %} - {% assign sid = p.subcategory | remove: " " | downcase %} - <li> - <a class="sidebar-nav-heading" data-toggle="collapse" href="#{{ cid }}-{{ sid }}" aria-expanded="false" {{ ac }}="{{ cid }}-{{ sid }}">{{ subcategory }}<span class="caret"></span></a> - <ul class="collapse sidebar-nav sidebar-submenu" id="{{ cid }}-{{ sid }}"> - {% endif %} + {% assign subcats = site.pages | where: 'layout', 'docs' | where: 'category', category | group_by: 'subcategory' | sort: 'name', 'first' %} + {% for subcategory in subcats %} + {% assign sorted_pages = subcategory.items | sort: 'weight', 'last' %} + {% if subcategory.name != "" %} + {% assign ac = "aria-controls" %} + {% assign cid = category | remove: " " | downcase %} + {% assign sid = subcategory.name | remove: " " | downcase %} + <li> + <a class="sidebar-nav-heading" data-toggle="collapse" href="#{{ cid }}-{{ sid }}" aria-expanded="false" {{ ac }}="{{ cid }}-{{ sid }}">{{ subcategory.name }}<span class="caret"></span></a> + <ul class="collapse sidebar-nav sidebar-submenu" id="{{ cid }}-{{ sid }}"> {% endif %} - <li><a href="{{ p.url }}">{{ p.title }}</a></li> - {% endfor %} - {% if subcategory != nil %} - </ul> + {% for p in sorted_pages %} + <li><a href="{{ p.url }}">{{ p.title }}</a></li> + {% endfor %} + {% if subcategory.name != "" %} </li> - {% endif %} + </ul> + {% endif %} + {% endfor %} </ul> {% endfor %} </nav> diff --git a/website/blog/2020-04-02-networking-security.md b/website/blog/2020-04-02-networking-security.md index 5a5e38fd7..f3ce02d11 100644 --- a/website/blog/2020-04-02-networking-security.md +++ b/website/blog/2020-04-02-networking-security.md @@ -108,7 +108,7 @@ re-architecting the TCP implementation to use fewer goroutines. Performance today is good enough for most applications and we are making steady improvements. For example, since May of 2019, we have improved the Netstack runsc -[iperf3 download benchmark](https://github.com/google/gvisor/blob/master/benchmarks/suites/network.py) +[iperf3 download benchmark](https://github.com/google/gvisor/tree/master/test/benchmarks/network) score by roughly 15% and upload score by around 10,000X. Current numbers are about 17 Gbps download and about 8 Gbps upload versus about 42 Gbps and 43 Gbps for native (Linux) respectively. diff --git a/website/performance/README.md b/website/performance/README.md index 0dbfd2f02..1758fc608 100644 --- a/website/performance/README.md +++ b/website/performance/README.md @@ -1,9 +1,10 @@ # Performance data -This directory holds the CSVs generated by the -[benchmark-tools][benchmark-tools] repository. +This directory holds the CSVs generated by the now removed benchmark-tools +repository. The new functionally equivalent +[benchmark-tools is available.][benchmark-tools] In the future, these will be automatically posted to a cloud storage bucket and loaded dynamically. At that point, this directory will be removed. -[benchmark-tools]: https://github.com/google/gvisor/tree/master/benchmarks +[benchmark-tools]: https://github.com/google/gvisor/tree/master/test/benchmarks |