diff options
435 files changed, 28815 insertions, 5251 deletions
@@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Build with C++17. +build --cxxopt=-std=c++17 + # Display the current git revision in the info block. build --stamp --workspace_status_command tools/workspace_status.sh @@ -23,7 +23,18 @@ go_path( "//runsc", # Packages that are not dependencies of //runsc. + "//pkg/sentry/kernel/memevent", + "//pkg/tcpip/adapters/gonet", "//pkg/tcpip/link/channel", + "//pkg/tcpip/link/muxed", + "//pkg/tcpip/link/sharedmem", + "//pkg/tcpip/link/sharedmem/pipe", + "//pkg/tcpip/link/sharedmem/queue", + "//pkg/tcpip/link/tun", + "//pkg/tcpip/link/waitable", + "//pkg/tcpip/sample/tun_tcp_connect", + "//pkg/tcpip/sample/tun_tcp_echo", + "//pkg/tcpip/transport/tcpconntrack", ], ) diff --git a/Dockerfile b/Dockerfile index 6e9d870db..738623023 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,8 @@ FROM ubuntu:bionic -RUN apt-get update && apt-get install -y curl gnupg2 git python3 +RUN apt-get update && apt-get install -y curl gnupg2 git python python3 python3-distutils python3-pip RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \ - curl https://bazel.build/bazel-release.pub.gpg | apt-key add - + curl https://bazel.build/bazel-release.pub.gpg | apt-key add - RUN apt-get update && apt-get install -y bazel && apt-get clean WORKDIR /gvisor @@ -22,7 +22,7 @@ bazel-server-start: docker-build --privileged \ gvisor-bazel \ sh -c "while :; do sleep 100; done" && \ - docker exec --user 0:0 -i gvisor-bazel sh -c "groupadd --gid $(GID) --non-unique gvisor && useradd --uid $(UID) --gid $(GID) -d $(HOME) gvisor" + docker exec --user 0:0 -i gvisor-bazel sh -c "groupadd --gid $(GID) --non-unique gvisor && useradd --uid $(UID) --non-unique --gid $(GID) -d $(HOME) gvisor" bazel-server: docker exec gvisor-bazel true || \ @@ -48,9 +48,10 @@ Make sure the following dependencies are installed: * Linux 4.14.77+ ([older linux][old-linux]) * [git][git] -* [Bazel][bazel] 0.28.0+ +* [Bazel][bazel] 1.2+ * [Python][python] * [Docker version 17.09.0 or greater][docker] +* C++ toolchain supporting C++17 (GCC 7+, Clang 5+) * Gold linker (e.g. `binutils-gold` package on Ubuntu) ### Building @@ -1,6 +1,7 @@ -# Load go bazel rules and gazelle. load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") +load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository") +# Load go bazel rules and gazelle. http_archive( name = "io_bazel_rules_go", sha256 = "b9aa86ec08a292b97ec4591cf578e020b35f98e12173bbd4a921f84f583aebd9", @@ -58,6 +59,26 @@ load("@com_google_protobuf//:protobuf_deps.bzl", "protobuf_deps") protobuf_deps() +# Load python dependencies. +git_repository( + name = "rules_python", + commit = "94677401bc56ed5d756f50b441a6a5c7f735a6d4", + remote = "https://github.com/bazelbuild/rules_python.git", + shallow_since = "1573842889 -0500", +) + +load("@rules_python//python:pip.bzl", "pip_import") + +pip_import( + name = "pydeps", + python_interpreter = "python3", + requirements = "//benchmarks:requirements.txt", +) + +load("@pydeps//:requirements.bzl", "pip_install") + +pip_install() + # Load bazel_toolchain to support Remote Build Execution. # See releases at https://releases.bazel.build/bazel-toolchains.html http_archive( @@ -85,6 +106,45 @@ load("@rules_pkg//:deps.bzl", "rules_pkg_dependencies") rules_pkg_dependencies() +# Container rules. +http_archive( + name = "io_bazel_rules_docker", + sha256 = "14ac30773fdb393ddec90e158c9ec7ebb3f8a4fd533ec2abbfd8789ad81a284b", + strip_prefix = "rules_docker-0.12.1", + urls = ["https://github.com/bazelbuild/rules_docker/releases/download/v0.12.1/rules_docker-v0.12.1.tar.gz"], +) + +load( + "@io_bazel_rules_docker//repositories:repositories.bzl", + container_repositories = "repositories", +) + +container_repositories() + +load("@io_bazel_rules_docker//repositories:deps.bzl", container_deps = "deps") + +container_deps() + +load( + "@io_bazel_rules_docker//container:container.bzl", + "container_pull", +) + +# This container is built from the Dockerfile in test/iptables/runner. +container_pull( + name = "iptables-test", + registry = "gcr.io", + repository = "gvisor-presubmit/iptables-test", + digest = "sha256:a137d692a2eb9fc7bf95c5f4a568da090e2c31098e93634421ed88f3a3f1db65", +) + +load( + "@io_bazel_rules_docker//go:image.bzl", + _go_image_repos = "repositories", +) + +_go_image_repos() + # External repositories, in sorted order. go_repository( name = "com_github_cenkalti_backoff", diff --git a/benchmarks/BUILD b/benchmarks/BUILD new file mode 100644 index 000000000..dbadeeaf2 --- /dev/null +++ b/benchmarks/BUILD @@ -0,0 +1,9 @@ +package(licenses = ["notice"]) + +py_binary( + name = "benchmarks", + srcs = ["run.py"], + main = "run.py", + python_version = "PY3", + deps = ["//benchmarks/runner"], +) diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 000000000..ad44cd6ac --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,172 @@ +# Benchmark tools + +These scripts are tools for collecting performance data for Docker-based tests. + +## Setup + +The scripts assume the following: + +* You have a local machine with bazel installed. +* You have some machine(s) with docker installed. These machines will be + refered to as the "Environment". +* Environment machines have the runtime(s) under test installed, such that you + can run docker with a command like: `docker run --runtime=$RUNTIME + your/image`. +* You are able to login to machines in the environment with the local machine + via ssh and the user for ssh can run docker commands without using `sudo`. +* The docker daemon on each of your environment machines is listening on + `unix:///var/run/docker.sock` (docker's default). + +For configuring the environment manually, consult the +[dockerd documentation][dockerd]. + +## Environment + +All benchmarks require a user defined yaml file describe the environment. These +files are of the form: + +```yaml +machine1: local +machine2: + hostname: 100.100.100.100 + username: username + key_path: ~/private_keyfile + key_password: passphrase +machine3: + hostname: 100.100.100.101 + username: username + key_path: ~/private_keyfile + key_password: passphrase +``` + +The yaml file defines an environment with three machines named `machine1`, +`machine2` and `machine3`. `machine1` is the local machine, `machine2` and +`machine3` are remote machines. Both `machine2` and `machine3` should be +reachable by `ssh`. For example, the command `ssh -i ~/private_keyfile +username@100.100.100.100` (using the passphrase `passphrase`) should connect to +`machine2`. + +The above is an example only. Machines should be uniform, since they are treated +as such by the tests. Machines must also be accessible to each other via their +default routes. Furthermore, some benchmarks will meaningless if running on the +local machine, such as density. + +For remote machines, `hostname`, `key_path`, and `username` are required and +others are optional. In addition key files must be generated +[using the instrcutions below](#generating-ssh-keys). + +The above yaml file can be checked for correctness with the `validate` command +in the top level perf.py script: + +`bazel run :benchmarks -- validate $PWD/examples/localhost.yaml` + +## Running benchmarks + +To list available benchmarks, use the `list` commmand: + +```bash +bazel run :benchmarks -- list + +... +Benchmark: sysbench.cpu +Metrics: events_per_second + Run sysbench CPU test. Additional arguments can be provided for sysbench. + + :param max_prime: The maximum prime number to search. +``` + +To run benchmarks, use the `run` command. For example, to run the sysbench +benchmark above: + +```bash +bazel run :benchmarks -- run --env $PWD/examples/localhost.yaml sysbench.cpu +``` + +You can run parameterized benchmarks, for example to run with different +runtimes: + +```bash +bazel run :benchmarks -- run --env $PWD/examples/localhost.yaml --runtime=runc --runtime=runsc sysbench.cpu +``` + +Or with different parameters: + +```bash +bazel run :benchmarks -- run --env $PWD/examples/localhost.yaml --max_prime=10 --max_prime=100 sysbench.cpu +``` + +## Writing benchmarks + +To write new benchmarks, you should familiarize yourself with the structure of +the repository. There are three key components. + +## Harness + +The harness makes use of the [docker py SDK][docker-py]. It is advisable that +you familiarize yourself with that API when making changes, specifically: + +* clients +* containers +* images + +In general, benchmarks need only interact with the `Machine` objects provided to +the benchmark function, which are the machines defined in the environment. These +objects allow the benchmark to define the relationships between different +containers, and parse the output. + +## Workloads + +The harness requires workloads to run. These are all available in the +`workloads` directory. + +In general, a workload consists of a Dockerfile to build it (while these are not +hermetic, in general they should be as fixed and isolated as possible), some +parses for output if required, parser tests and sample data. Provided the test +is named after the workload package and contains a function named `sample`, this +variable will be used to automatically mock workload output when the `--mock` +flag is provided to the main tool. + +## Writing benchmarks + +Benchmarks define the tests themselves. All benchmarks have the following +function signature: + +```python +def my_func(output) -> float: + return float(output) + +@benchmark(metrics = my_func, machines = 1) +def my_benchmark(machine: machine.Machine, arg: str): + return "3.4432" +``` + +Each benchmark takes a variable amount of position arguments as +`harness.Machine` objects and some set of keyword arguments. It is recommended +that you accept arbitrary keyword arguments and pass them through when +constructing the container under test. + +To write a new benchmark, open a module in the `suites` directory and use the +above signature. You should add a descriptive doc string to describe what your +benchmark is and any test centric arguments. + +## Generating SSH Keys + +The scripts only support RSA Keys, and ssh library used in paramiko. Paramiko +only supports RSA keys that look like the following (PEM format): + +```bash +$ cat /path/to/ssh/key + +-----BEGIN RSA PRIVATE KEY----- +...private key text... +-----END RSA PRIVATE KEY----- + +``` + +To generate ssh keys in PEM format, use the [`-t rsa -m PEM -b 4096`][RSA-keys]. +option. + +[dockerd]: https://docs.docker.com/engine/reference/commandline/dockerd/ +[docker-py]: https://docker-py.readthedocs.io/en/stable/ +[paramiko]: http://docs.paramiko.org/en/2.4/api/client.html +[RSA-keys]: https://serverfault.com/questions/939909/ssh-keygen-does-not-create-rsa-private-key diff --git a/benchmarks/defs.bzl b/benchmarks/defs.bzl new file mode 100644 index 000000000..79e6cdbc8 --- /dev/null +++ b/benchmarks/defs.bzl @@ -0,0 +1,18 @@ +"""Provides python helper functions.""" + +load("@pydeps//:requirements.bzl", _requirement = "requirement") + +def filter_deps(deps = None): + if deps == None: + deps = [] + return [dep for dep in deps if dep] + +def py_library(deps = None, **kwargs): + return native.py_library(deps = filter_deps(deps), **kwargs) + +def py_test(deps = None, **kwargs): + return native.py_test(deps = filter_deps(deps), **kwargs) + +def requirement(name, direct = True): + """ requirement returns the required dependency. """ + return _requirement(name) diff --git a/benchmarks/examples/localhost.yaml b/benchmarks/examples/localhost.yaml new file mode 100644 index 000000000..f70fe0fb7 --- /dev/null +++ b/benchmarks/examples/localhost.yaml @@ -0,0 +1,2 @@ +client: localhost +server: localhost diff --git a/benchmarks/harness/BUILD b/benchmarks/harness/BUILD new file mode 100644 index 000000000..9546220c4 --- /dev/null +++ b/benchmarks/harness/BUILD @@ -0,0 +1,89 @@ +load("//benchmarks:defs.bzl", "py_library", "requirement") + +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +py_library( + name = "harness", + srcs = ["__init__.py"], +) + +py_library( + name = "benchmark_driver", + srcs = ["benchmark_driver.py"], + deps = [ + "//benchmarks/harness/machine_mocks", + "//benchmarks/harness/machine_producers:machine_producer", + "//benchmarks/suites", + ], +) + +py_library( + name = "container", + srcs = ["container.py"], + deps = [ + requirement("asn1crypto", False), + requirement("chardet", False), + requirement("certifi", False), + requirement("docker", True), + requirement("docker-pycreds", False), + requirement("idna", False), + requirement("ptyprocess", False), + requirement("requests", False), + requirement("urllib3", False), + requirement("websocket-client", False), + ], +) + +py_library( + name = "machine", + srcs = ["machine.py"], + deps = [ + "//benchmarks/harness", + "//benchmarks/harness:container", + "//benchmarks/harness:ssh_connection", + "//benchmarks/harness:tunnel_dispatcher", + requirement("asn1crypto", False), + requirement("chardet", False), + requirement("certifi", False), + requirement("docker", True), + requirement("docker-pycreds", False), + requirement("idna", False), + requirement("ptyprocess", False), + requirement("requests", False), + requirement("urllib3", False), + requirement("websocket-client", False), + ], +) + +py_library( + name = "ssh_connection", + srcs = ["ssh_connection.py"], + deps = [ + "//benchmarks/harness", + requirement("bcrypt", False), + requirement("cffi", False), + requirement("paramiko", True), + requirement("cryptography", False), + ], +) + +py_library( + name = "tunnel_dispatcher", + srcs = ["tunnel_dispatcher.py"], + deps = [ + requirement("asn1crypto", False), + requirement("chardet", False), + requirement("certifi", False), + requirement("docker", True), + requirement("docker-pycreds", False), + requirement("idna", False), + requirement("pexpect", True), + requirement("ptyprocess", False), + requirement("requests", False), + requirement("urllib3", False), + requirement("websocket-client", False), + ], +) diff --git a/benchmarks/harness/__init__.py b/benchmarks/harness/__init__.py new file mode 100644 index 000000000..a7f34da9e --- /dev/null +++ b/benchmarks/harness/__init__.py @@ -0,0 +1,25 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Core benchmark utilities.""" + +import os + +# LOCAL_WORKLOADS_PATH defines the path to use for local workloads. This is a +# format string that accepts a single string parameter. +LOCAL_WORKLOADS_PATH = os.path.join( + os.path.dirname(__file__), "../workloads/{}") + +# REMOTE_WORKLOADS_PATH defines the path to use for storing the workloads on the +# remote host. This is a format string that accepts a single string parameter. +REMOTE_WORKLOADS_PATH = "workloads/{}" diff --git a/benchmarks/harness/benchmark_driver.py b/benchmarks/harness/benchmark_driver.py new file mode 100644 index 000000000..9abc21b54 --- /dev/null +++ b/benchmarks/harness/benchmark_driver.py @@ -0,0 +1,85 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Main driver for benchmarks.""" + +import copy +import statistics +import threading +import types + +from benchmarks import suites +from benchmarks.harness.machine_producers import machine_producer + + +# pylint: disable=too-many-instance-attributes +class BenchmarkDriver: + """Allocates machines and invokes a benchmark method.""" + + def __init__(self, + producer: machine_producer.MachineProducer, + method: types.FunctionType, + runs: int = 1, + **kwargs): + + self._producer = producer + self._method = method + self._kwargs = copy.deepcopy(kwargs) + self._threads = [] + self.lock = threading.RLock() + self._runs = runs + self._metric_results = {} + + def start(self): + """Starts a benchmark thread.""" + for _ in range(self._runs): + thread = threading.Thread(target=self._run_method) + thread.start() + self._threads.append(thread) + + def join(self): + """Joins the thread.""" + # pylint: disable=expression-not-assigned + [t.join() for t in self._threads] + + def _run_method(self): + """Runs all benchmarks.""" + machines = self._producer.get_machines( + suites.benchmark_machines(self._method)) + try: + result = self._method(*machines, **self._kwargs) + for name, res in result: + with self.lock: + if name in self._metric_results: + self._metric_results[name].append(res) + else: + self._metric_results[name] = [res] + finally: + # Always release. + self._producer.release_machines(machines) + + def median(self): + """Returns the median result, after join is finished.""" + for key, value in self._metric_results.items(): + yield key, [statistics.median(value)] + + def all(self): + """Returns all results.""" + for key, value in self._metric_results.items(): + yield key, value + + def meanstd(self): + """Returns all results.""" + for key, value in self._metric_results.items(): + mean = statistics.mean(value) + yield key, [mean, statistics.stdev(value, xbar=mean)] diff --git a/benchmarks/harness/container.py b/benchmarks/harness/container.py new file mode 100644 index 000000000..585436e20 --- /dev/null +++ b/benchmarks/harness/container.py @@ -0,0 +1,181 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Container definitions.""" + +import contextlib +import logging +import pydoc +import types +from typing import Tuple + +import docker +import docker.errors + +from benchmarks import workloads + + +class Container: + """Abstract container. + + Must be a context manager. + + Usage: + + with Container(client, image, ...): + ... + """ + + def run(self, **env) -> str: + """Run the container synchronously.""" + raise NotImplementedError + + def detach(self, **env): + """Run the container asynchronously.""" + raise NotImplementedError + + def address(self) -> Tuple[str, int]: + """Return the bound address for the container.""" + raise NotImplementedError + + def get_names(self) -> types.GeneratorType: + """Return names of all containers.""" + raise NotImplementedError + + +# pylint: disable=too-many-instance-attributes +class DockerContainer(Container): + """Class that handles creating a docker container.""" + + # pylint: disable=too-many-arguments + def __init__(self, + client: docker.DockerClient, + host: str, + image: str, + count: int = 1, + runtime: str = "runc", + port: int = 0, + **kwargs): + """Trys to setup "count" containers. + + Args: + client: A docker client from dockerpy. + host: The host address the image is running on. + image: The name of the image to run. + count: The number of containers to setup. + runtime: The container runtime to use. + port: The port to reserve. + **kwargs: Additional container options. + """ + assert count >= 1 + assert port == 0 or count == 1 + self._client = client + self._host = host + self._containers = [] + self._count = count + self._image = image + self._runtime = runtime + self._port = port + self._kwargs = kwargs + if port != 0: + self._ports = {"%d/tcp" % port: None} + else: + self._ports = {} + + @contextlib.contextmanager + def detach(self, **env): + env = ["%s=%s" % (key, value) for (key, value) in env.items()] + # Start all containers. + for _ in range(self._count): + try: + # Start the container in a detached mode. + container = self._client.containers.run( + self._image, + detach=True, + remove=True, + runtime=self._runtime, + ports=self._ports, + environment=env, + **self._kwargs) + logging.info("Started detached container %s -> %s", self._image, + container.attrs["Id"]) + self._containers.append(container) + except Exception as exc: + self._clean_containers() + raise exc + try: + # Wait for all containers to be up. + for container in self._containers: + while not container.attrs["State"]["Running"]: + container = self._client.containers.get(container.attrs["Id"]) + yield self + finally: + self._clean_containers() + + def address(self) -> Tuple[str, int]: + assert self._count == 1 + assert self._port != 0 + container = self._client.containers.get(self._containers[0].attrs["Id"]) + port = container.attrs["NetworkSettings"]["Ports"][ + "%d/tcp" % self._port][0]["HostPort"] + return (self._host, port) + + def get_names(self) -> types.GeneratorType: + for container in self._containers: + yield container.name + + def run(self, **env) -> str: + env = ["%s=%s" % (key, value) for (key, value) in env.items()] + return self._client.containers.run( + self._image, + runtime=self._runtime, + ports=self._ports, + remove=True, + environment=env, + **self._kwargs).decode("utf-8") + + def _clean_containers(self): + """Kills all containers.""" + for container in self._containers: + try: + container.kill() + except docker.errors.NotFound: + pass + + +class MockContainer(Container): + """Mock of Container.""" + + def __init__(self, workload: str): + self._workload = workload + + def __enter__(self): + return self + + def run(self, **env): + # Lookup sample data if any exists for the workload module. We use a + # well-defined test locate and a well-defined sample function. + mod = pydoc.locate(workloads.__name__ + "." + self._workload) + if hasattr(mod, "sample"): + return mod.sample(**env) + return "" # No output. + + def address(self) -> Tuple[str, int]: + return ("example.com", 80) + + def get_names(self) -> types.GeneratorType: + yield "mock" + + @contextlib.contextmanager + def detach(self, **env): + yield self diff --git a/benchmarks/harness/machine.py b/benchmarks/harness/machine.py new file mode 100644 index 000000000..66b719b63 --- /dev/null +++ b/benchmarks/harness/machine.py @@ -0,0 +1,224 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Machine abstraction passed to benchmarks to run docker containers. + +Abstraction for interacting with test machines. Machines are produced +by Machine producers and represent a local or remote machine. Benchmark +methods in /benchmarks/suite are passed the required number of machines in order +to run the benchmark. Machines contain methods to run commands via bash, +possibly over ssh. Machines also hold a connection to the docker UNIX socket +to run contianers. + + Typical usage example: + + machine = Machine() + machine.run(cmd) + machine.pull(path) + container = machine.container() +""" + +import logging +import re +import subprocess +import time +from typing import Tuple + +import docker + +from benchmarks import harness +from benchmarks.harness import container +from benchmarks.harness import machine_mocks +from benchmarks.harness import ssh_connection +from benchmarks.harness import tunnel_dispatcher + + +class Machine(object): + """The machine object is the primary object for benchmarks. + + Machine objects are passed to each metric function call and benchmarks use + machines to access real connections to those machines. + + Attributes: + _name: Name as a string + """ + _name = "" + + def run(self, cmd: str) -> Tuple[str, str]: + """Convenience method for running a bash command on a machine object. + + Some machines may point to the local machine, and thus, do not have ssh + connections. Run runs a command either local or over ssh and returns the + output stdout and stderr as strings. + + Args: + cmd: The command to run as a string. + + Returns: + The command output. + """ + raise NotImplementedError + + def read(self, path: str) -> str: + """Reads the contents of some file. + + This will be mocked. + + Args: + path: The path to the file to be read. + + Returns: + The file contents. + """ + raise NotImplementedError + + def pull(self, workload: str) -> str: + """Send the given workload to the machine, build and tag it. + + All images must be defined by the workloads directory. + + Args: + workload: The workload name. + + Returns: + The workload tag. + """ + raise NotImplementedError + + def container(self, image: str, **kwargs) -> container.Container: + """Returns a container object. + + Args: + image: The pulled image tag. + **kwargs: Additional container options. + + Returns: + :return: a container.Container object. + """ + raise NotImplementedError + + def sleep(self, amount: float): + """Sleeps the given amount of time.""" + time.sleep(amount) + + def __str__(self): + return self._name + + +class MockMachine(Machine): + """A mocked machine.""" + _name = "mock" + + def run(self, cmd: str) -> Tuple[str, str]: + return "", "" + + def read(self, path: str) -> str: + return machine_mocks.Readfile(path) + + def pull(self, workload: str) -> str: + return workload # Workload is the tag. + + def container(self, image: str, **kwargs) -> container.Container: + return container.MockContainer(image) + + def sleep(self, amount: float): + pass + + +def get_address(machine: Machine) -> str: + """Return a machine's default address.""" + default_route, _ = machine.run("ip route get 8.8.8.8") + return re.search(" src ([0-9.]+) ", default_route).group(1) + + +class LocalMachine(Machine): + """The local machine. + + Attributes: + _name: Name as a string + _docker_client: a pythonic connection to to the local dockerd unix socket. + See: https://github.com/docker/docker-py + """ + + def __init__(self, name): + self._name = name + self._docker_client = docker.from_env() + + def run(self, cmd: str) -> Tuple[str, str]: + process = subprocess.Popen( + cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = process.communicate() + return stdout.decode("utf-8"), stderr.decode("utf-8") + + def read(self, path: str) -> str: + # Read the exact path locally. + return open(path, "r").read() + + def pull(self, workload: str) -> str: + # Run the docker build command locally. + logging.info("Building %s@%s locally...", workload, self._name) + self.run("docker build --tag={} {}".format( + workload, harness.LOCAL_WORKLOADS_PATH.format(workload))) + return workload # Workload is the tag. + + def container(self, image: str, **kwargs) -> container.Container: + # Return a local docker container directly. + return container.DockerContainer(self._docker_client, get_address(self), + image, **kwargs) + + def sleep(self, amount: float): + time.sleep(amount) + + +class RemoteMachine(Machine): + """Remote machine accessible via an SSH connection. + + Attributes: + _name: Name as a string + _ssh_connection: a paramiko backed ssh connection which can be used to run + commands on this machine + _tunnel: a python wrapper around a port forwarded ssh connection between a + local unix socket and the remote machine's dockerd unix socket. + _docker_client: a pythonic wrapper backed by the _tunnel. Allows sending + docker commands: see https://github.com/docker/docker-py + """ + + def __init__(self, name, **kwargs): + self._name = name + self._ssh_connection = ssh_connection.SSHConnection(name, **kwargs) + self._tunnel = tunnel_dispatcher.Tunnel(name, **kwargs) + self._tunnel.connect() + self._docker_client = self._tunnel.get_docker_client() + + def run(self, cmd: str) -> Tuple[str, str]: + return self._ssh_connection.run(cmd) + + def read(self, path: str) -> str: + # Just cat remotely. + stdout, stderr = self._ssh_connection.run("cat '{}'".format(path)) + return stdout + stderr + + def pull(self, workload: str) -> str: + # Push to the remote machine and build. + logging.info("Building %s@%s remotely...", workload, self._name) + remote_path = self._ssh_connection.send_workload(workload) + self.run("docker build --tag={} {}".format(workload, remote_path)) + return workload # Workload is the tag. + + def container(self, image: str, **kwargs) -> container.Container: + # Return a remote docker container. + return container.DockerContainer(self._docker_client, get_address(self), + image, **kwargs) + + def sleep(self, amount: float): + time.sleep(amount) diff --git a/benchmarks/harness/machine_mocks/BUILD b/benchmarks/harness/machine_mocks/BUILD new file mode 100644 index 000000000..c8ec4bc79 --- /dev/null +++ b/benchmarks/harness/machine_mocks/BUILD @@ -0,0 +1,9 @@ +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +py_library( + name = "machine_mocks", + srcs = ["__init__.py"], +) diff --git a/benchmarks/harness/machine_mocks/__init__.py b/benchmarks/harness/machine_mocks/__init__.py new file mode 100644 index 000000000..00f0085d7 --- /dev/null +++ b/benchmarks/harness/machine_mocks/__init__.py @@ -0,0 +1,81 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Machine mock files.""" + +MEMINFO = """\ +MemTotal: 7652344 kB +MemFree: 7174724 kB +MemAvailable: 7152008 kB +Buffers: 7544 kB +Cached: 178856 kB +SwapCached: 0 kB +Active: 270928 kB +Inactive: 68436 kB +Active(anon): 153124 kB +Inactive(anon): 880 kB +Active(file): 117804 kB +Inactive(file): 67556 kB +Unevictable: 0 kB +Mlocked: 0 kB +SwapTotal: 0 kB +SwapFree: 0 kB +Dirty: 900 kB +Writeback: 0 kB +AnonPages: 153000 kB +Mapped: 129120 kB +Shmem: 1044 kB +Slab: 60864 kB +SReclaimable: 22792 kB +SUnreclaim: 38072 kB +KernelStack: 2672 kB +PageTables: 5756 kB +NFS_Unstable: 0 kB +Bounce: 0 kB +WritebackTmp: 0 kB +CommitLimit: 3826172 kB +Committed_AS: 663836 kB +VmallocTotal: 34359738367 kB +VmallocUsed: 0 kB +VmallocChunk: 0 kB +HardwareCorrupted: 0 kB +AnonHugePages: 0 kB +ShmemHugePages: 0 kB +ShmemPmdMapped: 0 kB +CmaTotal: 0 kB +CmaFree: 0 kB +HugePages_Total: 0 +HugePages_Free: 0 +HugePages_Rsvd: 0 +HugePages_Surp: 0 +Hugepagesize: 2048 kB +DirectMap4k: 94196 kB +DirectMap2M: 4624384 kB +DirectMap1G: 3145728 kB +""" + +CONTENTS = { + "/proc/meminfo": MEMINFO, +} + + +def Readfile(path: str) -> str: + """Reads a mock file. + + Args: + path: The target path. + + Returns: + Mocked file contents or None. + """ + return CONTENTS.get(path, None) diff --git a/benchmarks/harness/machine_producers/BUILD b/benchmarks/harness/machine_producers/BUILD new file mode 100644 index 000000000..a48da02a1 --- /dev/null +++ b/benchmarks/harness/machine_producers/BUILD @@ -0,0 +1,40 @@ +load("//benchmarks:defs.bzl", "py_library", "requirement") + +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +py_library( + name = "harness", + srcs = ["__init__.py"], +) + +py_library( + name = "machine_producer", + srcs = ["machine_producer.py"], +) + +py_library( + name = "mock_producer", + srcs = ["mock_producer.py"], + deps = [ + "//benchmarks/harness:machine", + "//benchmarks/harness/machine_producers:machine_producer", + ], +) + +py_library( + name = "yaml_producer", + srcs = ["yaml_producer.py"], + deps = [ + "//benchmarks/harness:machine", + "//benchmarks/harness/machine_producers:machine_producer", + requirement("PyYAML", False), + ], +) + +py_library( + name = "gcloud_mock_recorder", + srcs = ["gcloud_mock_recorder.py"], +) diff --git a/benchmarks/harness/machine_producers/__init__.py b/benchmarks/harness/machine_producers/__init__.py new file mode 100644 index 000000000..634ef4843 --- /dev/null +++ b/benchmarks/harness/machine_producers/__init__.py @@ -0,0 +1,13 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/benchmarks/harness/machine_producers/gcloud_mock_recorder.py b/benchmarks/harness/machine_producers/gcloud_mock_recorder.py new file mode 100644 index 000000000..fd9837a37 --- /dev/null +++ b/benchmarks/harness/machine_producers/gcloud_mock_recorder.py @@ -0,0 +1,97 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""A recorder and replay for testing the GCloudProducer. + +MockPrinter and MockReader handle printing and reading mock data for the +purposes of testing. MockPrinter is passed to GCloudProducer objects. The user +can then run scenarios and record them for playback in tests later. + +MockReader is passed to MockGcloudProducer objects and handles reading the +previously recorded mock data. + +It is left to the user to check if data printed is properly redacted for their +own use. The intended usecase for this class is data coming from gcloud +commands, which will contain public IPs and other instance data. + +The data format is json and printed/read from the ./test_data directory. The +data is the output of subprocess.CompletedProcess objects in json format. + + Typical usage example: + + recorder = MockPrinter() + producer = GCloudProducer(args, recorder) + machines = producer.get_machines(1) + with open("my_file.json") as fd: + recorder.write_out(fd) + + reader = MockReader(filename) + producer = MockGcloudProducer(args, mock) + machines = producer.get_machines(1) + assert len(machines) == 1 +""" + +import io +import json +import subprocess + + +class MockPrinter(object): + """Handles printing Mock data for MockGcloudProducer. + + Attributes: + _records: list of json object records for printing + """ + + def __init__(self): + self._records = [] + + def record(self, entry: subprocess.CompletedProcess): + """Records data and strips out ip addresses.""" + + record = { + "args": entry.args, + "stdout": entry.stdout.decode("utf-8"), + "returncode": str(entry.returncode) + } + self._records.append(record) + + def write_out(self, fd: io.FileIO): + """Prints out the data into the given filepath.""" + fd.write(json.dumps(self._records, indent=4)) + + +class MockReader(object): + """Handles reading Mock data for MockGcloudProducer. + + Attributes: + _records: List[json] records read from the passed in file. + """ + + def __init__(self, filepath: str): + with open(filepath, "rb") as file: + self._records = json.loads(file.read()) + self._i = 0 + + def __iter__(self): + return self + + def __next__(self, args) -> subprocess.CompletedProcess: + """Returns the next record as a CompletedProcess.""" + if self._i < len(self._records): + record = self._records[self._i] + stdout = record["stdout"].encode("ascii") + returncode = int(record["returncode"]) + return subprocess.CompletedProcess( + args=args, returncode=returncode, stdout=stdout, stderr=b"") + raise StopIteration() diff --git a/benchmarks/harness/machine_producers/machine_producer.py b/benchmarks/harness/machine_producers/machine_producer.py new file mode 100644 index 000000000..124ee14cc --- /dev/null +++ b/benchmarks/harness/machine_producers/machine_producer.py @@ -0,0 +1,30 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Abstract types.""" + +from typing import List + +from benchmarks.harness import machine + + +class MachineProducer: + """Abstract Machine producer.""" + + def get_machines(self, num_machines: int) -> List[machine.Machine]: + """Returns the requested number of machines.""" + raise NotImplementedError + + def release_machines(self, machine_list: List[machine.Machine]): + """Releases the given set of machines.""" + raise NotImplementedError diff --git a/benchmarks/harness/machine_producers/mock_producer.py b/benchmarks/harness/machine_producers/mock_producer.py new file mode 100644 index 000000000..4f29ad53f --- /dev/null +++ b/benchmarks/harness/machine_producers/mock_producer.py @@ -0,0 +1,31 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Producers of mocks.""" + +from typing import List + +from benchmarks.harness import machine +from benchmarks.harness.machine_producers import machine_producer + + +class MockMachineProducer(machine_producer.MachineProducer): + """Produces MockMachine objects.""" + + def get_machines(self, num_machines: int) -> List[machine.MockMachine]: + """Returns the request number of MockMachines.""" + return [machine.MockMachine() for i in range(num_machines)] + + def release_machines(self, machine_list: List[machine.MockMachine]): + """No-op.""" + return diff --git a/benchmarks/harness/machine_producers/yaml_producer.py b/benchmarks/harness/machine_producers/yaml_producer.py new file mode 100644 index 000000000..5d334e480 --- /dev/null +++ b/benchmarks/harness/machine_producers/yaml_producer.py @@ -0,0 +1,106 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Producers based on yaml files.""" + +import os +import threading +from typing import Dict +from typing import List + +import yaml + +from benchmarks.harness import machine +from benchmarks.harness.machine_producers import machine_producer + + +class YamlMachineProducer(machine_producer.MachineProducer): + """Loads machines from a yaml file.""" + + def __init__(self, path: str): + self.machines = build_machines(path) + self.max_machines = len(self.machines) + self.machine_condition = threading.Condition() + + def get_machines(self, num_machines: int) -> List[machine.Machine]: + if num_machines > self.max_machines: + raise ValueError( + "Insufficient Ammount of Machines. {ask} asked for and have {max_num} max." + .format(ask=num_machines, max_num=self.max_machines)) + + with self.machine_condition: + while not self._enough_machines(num_machines): + self.machine_condition.wait(timeout=1) + return [self.machines.pop(0) for _ in range(num_machines)] + + def release_machines(self, machine_list: List[machine.Machine]): + with self.machine_condition: + while machine_list: + next_machine = machine_list.pop() + self.machines.append(next_machine) + self.machine_condition.notify() + + def _enough_machines(self, ask: int): + return ask <= len(self.machines) + + +def build_machines(path: str, num_machines: str = -1) -> List[machine.Machine]: + """Builds machine objects defined by the yaml file "path". + + Args: + path: The path to a yaml file which defines machines. + num_machines: Optional limit on how many machine objects to build. + + Returns: + Machine objects in a list. + + If num_machines is set, len(machines) <= num_machines. + """ + data = parse_yaml(path) + machines = [] + for key, value in data.items(): + if len(machines) == num_machines: + return machines + if isinstance(value, dict): + machines.append(machine.RemoteMachine(key, **value)) + else: + machines.append(machine.LocalMachine(key)) + return machines + + +def parse_yaml(path: str) -> Dict[str, Dict[str, str]]: + """Parse the yaml file pointed by path. + + Args: + path: The path to yaml file. + + Returns: + The contents of the yaml file as a dictionary. + """ + data = get_file_contents(path) + return yaml.load(data, Loader=yaml.Loader) + + +def get_file_contents(path: str) -> str: + """Dumps the file contents to a string and returns them. + + Args: + path: The path to dump. + + Returns: + The file contents as a string. + """ + if not os.path.isabs(path): + path = os.path.abspath(path) + with open(path) as input_file: + return input_file.read() diff --git a/benchmarks/harness/ssh_connection.py b/benchmarks/harness/ssh_connection.py new file mode 100644 index 000000000..fcbfbcdb2 --- /dev/null +++ b/benchmarks/harness/ssh_connection.py @@ -0,0 +1,111 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""SSHConnection handles the details of SSH connections.""" + +import os +import warnings + +import paramiko + +from benchmarks import harness + +# Get rid of paramiko Cryptography Warnings. +warnings.filterwarnings(action="ignore", module=".*paramiko.*") + + +def send_one_file(client: paramiko.SSHClient, path: str, remote_dir: str): + """Sends a single file via an SSH client. + + Args: + client: The existing SSH client. + path: The local path. + remote_dir: The remote directory. + """ + filename = path.split("/").pop() + client.exec_command("mkdir -p " + remote_dir) + with client.open_sftp() as ftp_client: + ftp_client.put(path, os.path.join(remote_dir, filename)) + + +class SSHConnection: + """SSH connection to a remote machine.""" + + def __init__(self, name: str, hostname: str, key_path: str, username: str, + **kwargs): + """Sets up a paramiko ssh connection to the given hostname.""" + self._name = name # Unused. + self._hostname = hostname + self._username = username + self._key_path = key_path # RSA Key path + self._kwargs = kwargs + # SSHConnection wraps paramiko. paramiko supports RSA, ECDSA, and Ed25519 + # keys, and we've chosen to only suport and require RSA keys. paramiko + # supports RSA keys that begin with '----BEGIN RSAKEY----'. + # https://stackoverflow.com/questions/53600581/ssh-key-generated-by-ssh-keygen-is-not-recognized-by-paramiko + self.rsa_key = self._rsa() + self.run("true") # Validate. + + def _client(self) -> paramiko.SSHClient: + """Returns a connected SSH client.""" + client = paramiko.SSHClient() + client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + client.connect( + hostname=self._hostname, + port=22, + username=self._username, + pkey=self.rsa_key, + allow_agent=False, + look_for_keys=False) + return client + + def _rsa(self): + if "key_password" in self._kwargs: + password = self._kwargs["key_password"] + else: + password = None + rsa = paramiko.RSAKey.from_private_key_file(self._key_path, password) + return rsa + + def run(self, cmd: str) -> (str, str): + """Runs a command via ssh. + + Args: + cmd: The shell command to run. + + Returns: + The contents of stdout and stderr. + """ + with self._client() as client: + _, stdout, stderr = client.exec_command(command=cmd) + stdout.channel.recv_exit_status() + stdout = stdout.read().decode("utf-8") + stderr = stderr.read().decode("utf-8") + return stdout, stderr + + def send_workload(self, name: str) -> str: + """Sends a workload to the remote machine. + + Args: + name: The workload name. + + Returns: + The remote path. + """ + with self._client() as client: + for dirpath, _, filenames in os.walk( + harness.LOCAL_WORKLOADS_PATH.format(name)): + for filename in filenames: + send_one_file(client, os.path.join(dirpath, filename), + harness.REMOTE_WORKLOADS_PATH.format(name)) + return harness.REMOTE_WORKLOADS_PATH.format(name) diff --git a/benchmarks/harness/tunnel_dispatcher.py b/benchmarks/harness/tunnel_dispatcher.py new file mode 100644 index 000000000..c56fd022a --- /dev/null +++ b/benchmarks/harness/tunnel_dispatcher.py @@ -0,0 +1,122 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tunnel handles setting up connections to remote machines. + +Tunnel dispatcher is a wrapper around the connection from a local UNIX socket +and a remote UNIX socket via SSH with port forwarding. This is done to +initialize the pythonic dockerpy client to run containers on the remote host by +connecting to /var/run/docker.sock (where Docker is listening). Tunnel +dispatcher sets up the local UNIX socket and calls the `ssh` command as a +subprocess, and holds a reference to that subprocess. It manages clean-up on +exit as best it can by killing the ssh subprocess and deleting the local UNIX +socket,stored in /tmp for easy cleanup in most systems if this fails. + + Typical usage example: + + t = Tunnel(name, **kwargs) + t.connect() + client = t.get_docker_client() # + client.containers.run("ubuntu", "echo hello world") + +""" + +import os +import tempfile +import time + +import docker +import pexpect + +SSH_TUNNEL_COMMAND = """ssh + -o GlobalKnownHostsFile=/dev/null + -o UserKnownHostsFile=/dev/null + -o StrictHostKeyChecking=no + -o IdentitiesOnly=yes + -nNT -L {filename}:/var/run/docker.sock + -i {key_path} + {username}@{hostname}""" + + +class Tunnel(object): + """The tunnel object represents the tunnel via ssh. + + This connects a local unix domain socket with a remote socket. + + Attributes: + _filename: a temporary name of the UNIX socket prefixed by the name + argument. + _hostname: the IP or resolvable hostname of the remote host. + _username: the username of the ssh_key used to run ssh. + _key_path: path to a valid key. + _key_password: optional password to the ssh key in _key_path + _process: holds reference to the ssh subprocess created. + + Returns: + The new minimum port. + + Raises: + ConnectionError: If no available port is found. + """ + + def __init__(self, + name: str, + hostname: str, + username: str, + key_path: str, + key_password: str = "", + **kwargs): + self._filename = tempfile.NamedTemporaryFile(prefix=name).name + self._hostname = hostname + self._username = username + self._key_path = key_path + self._key_password = key_password + self._kwargs = kwargs + self._process = None + + def connect(self): + """Connects the SSH tunnel and stores the subprocess reference in _process.""" + cmd = SSH_TUNNEL_COMMAND.format( + filename=self._filename, + key_path=self._key_path, + username=self._username, + hostname=self._hostname) + self._process = pexpect.spawn(cmd, timeout=10) + + # If given a password, assume we'll be asked for it. + if self._key_password: + self._process.expect(["Enter passphrase for key .*: "]) + self._process.sendline(self._key_password) + + while True: + # Wait for the tunnel to appear. + if self._process.exitstatus is not None: + raise ConnectionError("Error in setting up ssh tunnel") + if os.path.exists(self._filename): + return + time.sleep(0.1) + + def path(self): + """Return the socket file.""" + return self._filename + + def get_docker_client(self): + """Returns a docker client for this Tunnel.""" + return docker.DockerClient(base_url="unix:/" + self._filename) + + def __del__(self): + """Closes the ssh connection process and deletes the socket file.""" + if self._process: + self._process.close() + if os.path.exists(self._filename): + os.remove(self._filename) diff --git a/benchmarks/requirements.txt b/benchmarks/requirements.txt new file mode 100644 index 000000000..577eb1a2e --- /dev/null +++ b/benchmarks/requirements.txt @@ -0,0 +1,32 @@ +asn1crypto==1.2.0 +atomicwrites==1.3.0 +attrs==19.3.0 +bcrypt==3.1.7 +certifi==2019.9.11 +cffi==1.13.2 +chardet==3.0.4 +Click==7.0 +cryptography==2.8 +docker==3.7.0 +docker-pycreds==0.4.0 +idna==2.8 +importlib-metadata==0.23 +more-itertools==7.2.0 +packaging==19.2 +paramiko==2.6.0 +pathlib2==2.3.5 +pexpect==4.7.0 +pluggy==0.9.0 +ptyprocess==0.6.0 +py==1.8.0 +pycparser==2.19 +PyNaCl==1.3.0 +pyparsing==2.4.5 +pytest==4.3.0 +PyYAML==5.1.2 +requests==2.22.0 +six==1.13.0 +urllib3==1.25.7 +wcwidth==0.1.7 +websocket-client==0.56.0 +zipp==0.6.0 diff --git a/benchmarks/run.py b/benchmarks/run.py new file mode 100644 index 000000000..a22eb8641 --- /dev/null +++ b/benchmarks/run.py @@ -0,0 +1,19 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Benchmark runner.""" + +from benchmarks import runner + +if __name__ == "__main__": + runner.runner() diff --git a/benchmarks/runner/BUILD b/benchmarks/runner/BUILD new file mode 100644 index 000000000..de24824cc --- /dev/null +++ b/benchmarks/runner/BUILD @@ -0,0 +1,53 @@ +load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement") + +package(licenses = ["notice"]) + +py_library( + name = "runner", + srcs = ["__init__.py"], + data = [ + "//benchmarks/workloads:files", + ], + visibility = ["//benchmarks:__pkg__"], + deps = [ + "//benchmarks/harness:benchmark_driver", + "//benchmarks/harness/machine_producers:mock_producer", + "//benchmarks/harness/machine_producers:yaml_producer", + "//benchmarks/suites", + "//benchmarks/suites:absl", + "//benchmarks/suites:density", + "//benchmarks/suites:fio", + "//benchmarks/suites:helpers", + "//benchmarks/suites:http", + "//benchmarks/suites:media", + "//benchmarks/suites:ml", + "//benchmarks/suites:network", + "//benchmarks/suites:redis", + "//benchmarks/suites:startup", + "//benchmarks/suites:sysbench", + "//benchmarks/suites:syscall", + requirement("click", True), + ], +) + +py_test( + name = "runner_test", + srcs = ["runner_test.py"], + python_version = "PY3", + tags = [ + "local", + "manual", + ], + deps = [ + ":runner", + requirement("click", True), + requirement("attrs", False), + requirement("atomicwrites", False), + requirement("more-itertools", False), + requirement("pathlib2", False), + requirement("pluggy", False), + requirement("py", False), + requirement("pytest", True), + requirement("six", False), + ], +) diff --git a/benchmarks/runner/__init__.py b/benchmarks/runner/__init__.py new file mode 100644 index 000000000..9bf9cfd65 --- /dev/null +++ b/benchmarks/runner/__init__.py @@ -0,0 +1,301 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""High-level benchmark utility.""" + +import copy +import csv +import logging +import pkgutil +import pydoc +import re +import sys +import types +from typing import List +from typing import Tuple + +import click + +from benchmarks import suites +from benchmarks.harness import benchmark_driver +from benchmarks.harness.machine_producers import mock_producer +from benchmarks.harness.machine_producers import yaml_producer + + +@click.group() +@click.option( + "--verbose/--no-verbose", default=False, help="Enable verbose logging.") +@click.option("--debug/--no-debug", default=False, help="Enable debug logging.") +def runner(verbose: bool = False, debug: bool = False): + """Run distributed benchmarks. + + See the run and list commands for details. + + Args: + verbose: Enable verbose logging. + debug: Enable debug logging (supercedes verbose). + """ + if debug: + logging.basicConfig(level=logging.DEBUG) + elif verbose: + logging.basicConfig(level=logging.INFO) + + +def find_benchmarks( + regex: str) -> List[Tuple[str, types.ModuleType, types.FunctionType]]: + """Finds all available benchmarks. + + Args: + regex: A regular expression to match. + + Returns: + A (short_name, module, function) tuple for each match. + """ + pkgs = pkgutil.walk_packages(suites.__path__, suites.__name__ + ".") + found = [] + for _, name, _ in pkgs: + mod = pydoc.locate(name) + funcs = [ + getattr(mod, x) + for x in dir(mod) + if suites.is_benchmark(getattr(mod, x)) + ] + for func in funcs: + # Use the short_name with the benchmarks. prefix stripped. + prefix_len = len(suites.__name__ + ".") + short_name = mod.__name__[prefix_len:] + "." + func.__name__ + # Add to the list if a pattern is provided. + if re.compile(regex).match(short_name): + found.append((short_name, mod, func)) + return found + + +@runner.command("list") +@click.argument("method", nargs=-1) +def list_all(method): + """Lists available benchmarks.""" + if not method: + method = ".*" + else: + method = "(" + ",".join(method) + ")" + for (short_name, _, func) in find_benchmarks(method): + print("Benchmark %s:" % short_name) + metrics = suites.benchmark_metrics(func) + if func.__doc__: + print(" " + func.__doc__.lstrip().rstrip()) + if metrics: + print("\n Metrics:") + for metric in metrics: + print("\t{name}: {doc}".format(name=metric[0], doc=metric[1])) + print("\n") + + +# pylint: disable=too-many-arguments +# pylint: disable=too-many-branches +# pylint: disable=too-many-locals +@runner.command( + context_settings=dict(ignore_unknown_options=True, allow_extra_args=True)) +@click.pass_context +@click.argument("method") +@click.option("--mock/--no-mock", default=False, help="Mock the machines.") +@click.option("--env", default=None, help="Specify a yaml file with machines.") +@click.option( + "--runtime", default=["runc"], help="The runtime to use.", multiple=True) +@click.option("--metric", help="The metric to extract.", multiple=True) +@click.option( + "--runs", default=1, help="The number of times to run each benchmark.") +@click.option( + "--stat", + default="median", + help="How to aggregate the data from all runs." + "\nmedian - returns the median of all runs (default)" + "\nall - returns all results comma separated" + "\nmeanstd - returns result as mean,std") +# pylint: disable=too-many-statements +def run(ctx, method: str, runs: int, env: str, mock: bool, runtime: List[str], + metric: List[str], stat: str, **kwargs): + """Runs arbitrary benchmarks. + + All unknown command line flags are passed through to the underlying benchmark + method. Flags may be specified multiple times, in which case it is considered + a "dimension" for the test, and a comma-separated table will be emitted + instead of a single result. + + See the output of list to see available metrics for any given benchmark + method. The method parameter is a regular expression that will match against + available benchmarks. If multiple benchmarks match, then that is considered a + distinct "dimension" for the test. + + All benchmarks are run in parallel where possible, but have exclusive + ownership over the individual machines. + + Exactly one of the --mock and --env flag must be specified. + + Every benchmark method will be run the times indicated by --runs. + + Args: + ctx: Click context. + method: A regular expression for methods to be run. + runs: Number of runs. + env: Environment to use. + mock: If true, use mocked environment (supercedes env). + runtime: A list of runtimes to test. + metric: A list of metrics to extract. + stat: The class of statistics to extract. + **kwargs: Dimensions to test. + """ + # First, calculate additional arguments. + # + # This essentially calculates any arguments that appear multiple times, and + # moves those to the "dimensions" dictionary, which maps to lists. These + # dimensions are then iterated over to generate the relevant csv output. + dimensions = {} + + if stat not in ["median", "all", "meanstd"]: + raise ValueError("Illegal value for --result, see help.") + + def squish(key: str, value: str): + """Collapse an argument into kwargs or dimensions.""" + if key in dimensions: + # Extend an existing dimension. + dimensions[key].append(value) + elif key in kwargs: + # Create a new dimension. + dimensions[key] = [kwargs[key], value] + del kwargs[key] + else: + # A single value. + kwargs[key] = value + + for item in ctx.args: + if "=" in method: + # This must be the method. The method is simply set to the first + # non-matching argument, which we're also parsing here. + item, method = method, item + if "=" not in item: + logging.error("illegal argument: %s", item) + sys.exit(1) + (key, value) = item.lstrip("-").split("=", 1) + squish(key, value) + + # Convert runtime and metric to dimensions. + # + # They exist only in the arguments above for documentation purposes. + # Essentially here we are treating them like anything else. Note however, + # that an empty set here will result in a dimension. This is important for + # metrics, where an empty set actually means all metrics. + def fold(key: str, value, allow_flatten=False): + """Collapse a list value into kwargs or dimensions.""" + if len(value) == 1 and allow_flatten: + kwargs[key] = value[0] + else: + dimensions[key] = value + + fold("runtime", runtime, allow_flatten=True) + fold("metric", metric) + + # Lookup the methods. + # + # We match the method parameter to a regular expression. This allows you to + # do things like `run --mock .*` for a broad test. Note that we track the + # short_names in the dimensions here, and look up again in the recursion. + methods = { + short_name: func for (short_name, _, func) in find_benchmarks(method) + } + if not methods: + # Must match at least one method. + logging.error("no matching benchmarks for %s: try list.", method) + sys.exit(1) + fold("method", list(methods.keys()), allow_flatten=True) + + # Construct the environment. + if mock and env: + # You can't provide both. + logging.error("both --mock and --env are set: which one is it?") + sys.exit(1) + elif mock: + producer = mock_producer.MockMachineProducer() + elif env: + producer = yaml_producer.YamlMachineProducer(env) + else: + # You must provide one of mock or env. + logging.error("no enviroment provided: use --mock or --env.") + sys.exit(1) + + # Spin up the drivers. + # + # We ensure that metric is the last entry, because we have special behavior. + # They actually run the test once and the benchmark is a generator that + # produces all viable metrics. + dimension_keys = list(dimensions.keys()) + if "metric" in dimension_keys: + dimension_keys.remove("metric") + dimension_keys.append("metric") + drivers = [] + + def _start(keywords, finished, left): + """Runs a test across dimensions recursively.""" + # Resolve the method fully, it starts as a string. + if "method" in keywords and isinstance(keywords["method"], str): + keywords["method"] = methods[keywords["method"]] + # Is this a non-recursive case? + if not left: + driver = benchmark_driver.BenchmarkDriver(producer, runs=runs, **keywords) + driver.start() + drivers.append((finished, driver)) + else: + # Recurse on the next dimension. + current, left = left[0], left[1:] + keywords = copy.deepcopy(keywords) + if current == "metric": + # We use a generator, popped below. Note that metric is + # guaranteed to be the last element here, and we will provide + # the value for 'done' below when generating the csv. + keywords[current] = dimensions[current] + _start(keywords, finished, left) + else: + # Generate manually. + for value in dimensions[current]: + keywords[current] = value + _start(keywords, finished + [value], left) + + # Start all the drivers, recursively. + _start(kwargs, [], dimension_keys) + + # Finish all tests, write results. + output = csv.writer(sys.stdout) + output.writerow(dimension_keys + ["result"]) + for (done, driver) in drivers: + driver.join() + for (metric_name, result) in getattr(driver, stat)(): + output.writerow([ # Collapse the method name. + hasattr(x, "__name__") and x.__name__ or x for x in done + ] + [metric_name] + result) + + +@runner.command() +@click.argument("env") +@click.option( + "--cmd", default="uname -a", help="command to run on all found machines") +@click.option( + "--workload", default="true", help="workload to run all found machines") +def validate(env, cmd, workload): + """Validates an environment described by yaml file.""" + producer = yaml_producer.YamlMachineProducer(env) + for machine in producer.machines: + print("Machine %s:" % machine) + stdout, _ = machine.run(cmd) + print(" Output of '%s': %s" % (cmd, stdout.lstrip().rstrip())) + image = machine.pull(workload) + stdout = machine.container(image).run() + print(" Container %s: %s" % (workload, stdout.lstrip().rstrip())) diff --git a/benchmarks/runner/runner_test.py b/benchmarks/runner/runner_test.py new file mode 100644 index 000000000..5719c2838 --- /dev/null +++ b/benchmarks/runner/runner_test.py @@ -0,0 +1,59 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Top-level tests.""" + +import os +import subprocess +import sys + +from click import testing +import pytest + +from benchmarks import runner + + +def _get_locale(): + output = subprocess.check_output(["locale", "-a"]) + locales = output.split() + if b"en_US.utf8" in locales: + return "en_US.UTF-8" + else: + return "C.UTF-8" + + +def _set_locale(): + locale = _get_locale() + if os.getenv("LANG") != locale: + os.environ["LANG"] = locale + os.environ["LC_ALL"] = locale + os.execv("/proc/self/exe", ["python"] + sys.argv) + + +def test_list(): + cli_runner = testing.CliRunner() + result = cli_runner.invoke(runner.runner, ["list"]) + print(result.output) + assert result.exit_code == 0 + + +def test_run(): + cli_runner = testing.CliRunner() + result = cli_runner.invoke(runner.runner, ["run", "--mock", "."]) + print(result.output) + assert result.exit_code == 0 + + +if __name__ == "__main__": + _set_locale() + sys.exit(pytest.main([__file__])) diff --git a/benchmarks/suites/BUILD b/benchmarks/suites/BUILD new file mode 100644 index 000000000..04fc23261 --- /dev/null +++ b/benchmarks/suites/BUILD @@ -0,0 +1,130 @@ +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +py_library( + name = "suites", + srcs = ["__init__.py"], +) + +py_library( + name = "absl", + srcs = ["absl.py"], + deps = [ + "//benchmarks/harness:machine", + "//benchmarks/suites", + "//benchmarks/workloads/absl", + ], +) + +py_library( + name = "density", + srcs = ["density.py"], + deps = [ + "//benchmarks/harness:container", + "//benchmarks/harness:machine", + "//benchmarks/suites", + "//benchmarks/suites:helpers", + ], +) + +py_library( + name = "fio", + srcs = ["fio.py"], + deps = [ + "//benchmarks/harness:machine", + "//benchmarks/suites", + "//benchmarks/suites:helpers", + "//benchmarks/workloads/fio", + ], +) + +py_library( + name = "helpers", + srcs = ["helpers.py"], + deps = ["//benchmarks/harness:machine"], +) + +py_library( + name = "http", + srcs = ["http.py"], + deps = [ + "//benchmarks/harness:machine", + "//benchmarks/suites", + "//benchmarks/workloads/ab", + ], +) + +py_library( + name = "media", + srcs = ["media.py"], + deps = [ + "//benchmarks/harness:machine", + "//benchmarks/suites", + "//benchmarks/suites:helpers", + "//benchmarks/workloads/ffmpeg", + ], +) + +py_library( + name = "ml", + srcs = ["ml.py"], + deps = [ + "//benchmarks/harness:machine", + "//benchmarks/suites", + "//benchmarks/suites:startup", + "//benchmarks/workloads/tensorflow", + ], +) + +py_library( + name = "network", + srcs = ["network.py"], + deps = [ + "//benchmarks/harness:machine", + "//benchmarks/suites", + "//benchmarks/suites:helpers", + "//benchmarks/workloads/iperf", + ], +) + +py_library( + name = "redis", + srcs = ["redis.py"], + deps = [ + "//benchmarks/harness:machine", + "//benchmarks/suites", + "//benchmarks/workloads/redisbenchmark", + ], +) + +py_library( + name = "startup", + srcs = ["startup.py"], + deps = [ + "//benchmarks/harness:machine", + "//benchmarks/suites", + "//benchmarks/suites:helpers", + ], +) + +py_library( + name = "sysbench", + srcs = ["sysbench.py"], + deps = [ + "//benchmarks/harness:machine", + "//benchmarks/suites", + "//benchmarks/workloads/sysbench", + ], +) + +py_library( + name = "syscall", + srcs = ["syscall.py"], + deps = [ + "//benchmarks/harness:machine", + "//benchmarks/suites", + "//benchmarks/workloads/syscall", + ], +) diff --git a/benchmarks/suites/__init__.py b/benchmarks/suites/__init__.py new file mode 100644 index 000000000..360736cc3 --- /dev/null +++ b/benchmarks/suites/__init__.py @@ -0,0 +1,119 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Core benchmark annotations.""" + +import functools +import inspect +import types +from typing import List +from typing import Tuple + +BENCHMARK_METRICS = '__benchmark_metrics__' +BENCHMARK_MACHINES = '__benchmark_machines__' + + +def is_benchmark(func: types.FunctionType) -> bool: + """Returns true if the given function is a benchmark.""" + return isinstance(func, types.FunctionType) and \ + hasattr(func, BENCHMARK_METRICS) and \ + hasattr(func, BENCHMARK_MACHINES) + + +def benchmark_metrics(func: types.FunctionType) -> List[Tuple[str, str]]: + """Returns the list of available metrics.""" + return [(metric.__name__, metric.__doc__) + for metric in getattr(func, BENCHMARK_METRICS)] + + +def benchmark_machines(func: types.FunctionType) -> int: + """Returns the number of machines required.""" + return getattr(func, BENCHMARK_MACHINES) + + +# pylint: disable=unused-argument +def default(value, **kwargs): + """Returns the passed value.""" + return value + + +def benchmark(metrics: List[types.FunctionType] = None, + machines: int = 1) -> types.FunctionType: + """Define a benchmark function with metrics. + + Args: + metrics: A list of metric functions. + machines: The number of machines required. + + Returns: + A function that accepts the given number of machines, and iteratively + returns a set of (metric_name, metric_value) pairs when called repeatedly. + """ + if not metrics: + # The default passes through. + metrics = [default] + + def decorator(func: types.FunctionType) -> types.FunctionType: + """Decorator function.""" + # Every benchmark should accept at least two parameters: + # runtime: The runtime to use for the benchmark (str, required). + # metrics: The metrics to use, if not the default (str, optional). + @functools.wraps(func) + def wrapper(*args, runtime: str, metric: list = None, **kwargs): + """Wrapper function.""" + # First -- ensure that we marshall all types appropriately. In + # general, we will call this with only strings. These strings will + # need to be converted to their underlying types/classes. + sig = inspect.signature(func) + for param in sig.parameters.values(): + if param.annotation != inspect.Parameter.empty and \ + param.name in kwargs and not isinstance(kwargs[param.name], param.annotation): + try: + # Marshall to the appropriate type. + kwargs[param.name] = param.annotation(kwargs[param.name]) + except Exception as exc: + raise ValueError( + 'illegal type for %s(%s=%s): %s' % + (func.__name__, param.name, kwargs[param.name], exc)) + elif param.default != inspect.Parameter.empty and \ + param.name not in kwargs: + # Ensure that we have the value set, because it will + # be passed to the metric function for evaluation. + kwargs[param.name] = param.default + + # Next, figure out how to apply a metric. We do this prior to + # running the underlying function to prevent having to wait a few + # minutes for a result just to see some error. + if not metric: + # Return all metrics in the iterator. + result = func(*args, runtime=runtime, **kwargs) + for metric_func in metrics: + yield (metric_func.__name__, metric_func(result, **kwargs)) + else: + result = None + for single_metric in metric: + for metric_func in metrics: + # Is this a function that matches the name? + # Apply this function to the result. + if metric_func.__name__ == single_metric: + if not result: + # Lazy evaluation: only if metric matches. + result = func(*args, runtime=runtime, **kwargs) + yield single_metric, metric_func(result, **kwargs) + + # Set metadata on the benchmark (used above). + setattr(wrapper, BENCHMARK_METRICS, metrics) + setattr(wrapper, BENCHMARK_MACHINES, machines) + return wrapper + + return decorator diff --git a/benchmarks/suites/absl.py b/benchmarks/suites/absl.py new file mode 100644 index 000000000..5d9b57a09 --- /dev/null +++ b/benchmarks/suites/absl.py @@ -0,0 +1,37 @@ +# python3 +# Copyright 2019 The gVisor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""absl build benchmark.""" + +from benchmarks import suites +from benchmarks.harness import machine +from benchmarks.workloads import absl + + +@suites.benchmark(metrics=[absl.elapsed_time], machines=1) +def build(target: machine.Machine, **kwargs) -> str: + """Runs the absl workload and report the absl build time. + + Runs the 'bazel build //absl/...' in a clean bazel directory and + monitors time elapsed. + + Args: + target: A machine object. + **kwargs: Additional container options. + + Returns: + Container output. + """ + image = target.pull("absl") + return target.container(image, **kwargs).run() diff --git a/benchmarks/suites/density.py b/benchmarks/suites/density.py new file mode 100644 index 000000000..89d29fb26 --- /dev/null +++ b/benchmarks/suites/density.py @@ -0,0 +1,121 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Density tests.""" + +import re +import types + +from benchmarks import suites +from benchmarks.harness import container +from benchmarks.harness import machine +from benchmarks.suites import helpers + + +# pylint: disable=unused-argument +def memory_usage(value, **kwargs): + """Returns the passed value.""" + return value + + +def density(target: machine.Machine, + workload: str, + count: int = 50, + wait: float = 0, + load_func: types.FunctionType = None, + **kwargs): + """Calculate the average memory usage per container. + + Args: + target: A machine object. + workload: The workload to run. + count: The number of containers to start. + wait: The time to wait after starting. + load_func: Callback that is called after count images have been started on + the given machine. + **kwargs: Additional container options. + + Returns: + The average usage in Kb per container. + """ + count = int(count) + + # Drop all caches. + helpers.drop_caches(target) + before = target.read("/proc/meminfo") + + # Load the workload. + image = target.pull(workload) + + with target.container( + image=image, count=count, **kwargs).detach() as containers: + # Call the optional load function callback if given. + if load_func: + load_func(target, containers) + # Wait 'wait' time before taking a measurement. + target.sleep(wait) + + # Drop caches again. + helpers.drop_caches(target) + after = target.read("/proc/meminfo") + + # Calculate the memory used. + available_re = re.compile(r"MemAvailable:\s*(\d+)\skB\n") + before_available = available_re.findall(before) + after_available = available_re.findall(after) + return 1024 * float(int(before_available[0]) - + int(after_available[0])) / float(count) + + +def load_redis(target: machine.Machine, containers: container.Container): + """Use redis-benchmark "LPUSH" to load each container with 1G of data. + + Args: + target: A machine object. + containers: A set of containers. + """ + target.pull("redisbenchmark") + for name in containers.get_names(): + flags = "-d 10000 -t LPUSH" + target.container( + "redisbenchmark", links={ + name: name + }).run( + host=name, flags=flags) + + +@suites.benchmark(metrics=[memory_usage], machines=1) +def empty(target: machine.Machine, **kwargs) -> float: + """Run trivial containers in a density test.""" + return density(target, workload="sleep", wait=1.0, **kwargs) + + +@suites.benchmark(metrics=[memory_usage], machines=1) +def node(target: machine.Machine, **kwargs) -> float: + """Run node containers in a density test.""" + return density(target, workload="node", wait=3.0, **kwargs) + + +@suites.benchmark(metrics=[memory_usage], machines=1) +def ruby(target: machine.Machine, **kwargs) -> float: + """Run ruby containers in a density test.""" + return density(target, workload="ruby", wait=3.0, **kwargs) + + +@suites.benchmark(metrics=[memory_usage], machines=1) +def redis(target: machine.Machine, **kwargs) -> float: + """Run redis containers in a density test.""" + if "count" not in kwargs: + kwargs["count"] = 5 + return density( + target, workload="redis", wait=3.0, load_func=load_redis, **kwargs) diff --git a/benchmarks/suites/fio.py b/benchmarks/suites/fio.py new file mode 100644 index 000000000..2171790c5 --- /dev/null +++ b/benchmarks/suites/fio.py @@ -0,0 +1,165 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""File I/O tests.""" + +import os + +from benchmarks import suites +from benchmarks.harness import machine +from benchmarks.suites import helpers +from benchmarks.workloads import fio + + +# pylint: disable=too-many-arguments +# pylint: disable=too-many-locals +def run_fio(target: machine.Machine, + test: str, + ioengine: str = "sync", + size: int = 1024 * 1024 * 1024, + iodepth: int = 4, + blocksize: int = 1024 * 1024, + time: int = -1, + mount_dir: str = "", + filename: str = "file.dat", + tmpfs: bool = False, + ramp_time: int = 0, + **kwargs) -> str: + """FIO benchmarks. + + For more on fio see: + https://media.readthedocs.org/pdf/fio/latest/fio.pdf + + Args: + target: A machine object. + test: The test to run (read, write, randread, randwrite, etc.) + ioengine: The engine for I/O. + size: The size of the generated file in bytes (if an integer) or 5g, 16k, + etc. + iodepth: The I/O for certain engines. + blocksize: The blocksize for reads and writes in bytes (if an integer) or + 4k, etc. + time: If test is time based, how long to run in seconds. + mount_dir: The absolute path on the host to mount a bind mount. + filename: The name of the file to creat inside container. For a path of + /dir/dir/file, the script setup a volume like 'docker run -v + mount_dir:/dir/dir fio' and fio will create (and delete) the file + /dir/dir/file. If tmpfs is set, this /dir/dir will be a tmpfs. + tmpfs: If true, mount on tmpfs. + ramp_time: The time to run before recording statistics + **kwargs: Additional container options. + + Returns: + The output of fio as a string. + """ + # Pull the image before dropping caches. + image = target.pull("fio") + + if not mount_dir: + stdout, _ = target.run("pwd") + mount_dir = stdout.rstrip() + + # Setup the volumes. + volumes = {mount_dir: {"bind": "/disk", "mode": "rw"}} if not tmpfs else None + tmpfs = {"/disk": ""} if tmpfs else None + + # Construct a file in the volume. + filepath = os.path.join("/disk", filename) + + # If we are running a read test, us fio to write a file and then flush file + # data from memory. + if "read" in test: + target.container( + image, volumes=volumes, tmpfs=tmpfs, **kwargs).run( + test="write", + ioengine="sync", + size=size, + iodepth=iodepth, + blocksize=blocksize, + path=filepath) + helpers.drop_caches(target) + + # Run the test. + time_str = "--time_base --runtime={time}".format( + time=time) if int(time) > 0 else "" + res = target.container( + image, volumes=volumes, tmpfs=tmpfs, **kwargs).run( + test=test, + ioengine=ioengine, + size=size, + iodepth=iodepth, + blocksize=blocksize, + time=time_str, + path=filepath, + ramp_time=ramp_time) + + target.run( + "rm {path}".format(path=os.path.join(mount_dir.rstrip(), filename))) + + return res + + +@suites.benchmark(metrics=[fio.read_bandwidth, fio.read_io_ops], machines=1) +def read(*args, **kwargs): + """Read test. + + Args: + *args: None. + **kwargs: Additional container options. + + Returns: + The output of fio. + """ + return run_fio(*args, test="read", **kwargs) + + +@suites.benchmark(metrics=[fio.read_bandwidth, fio.read_io_ops], machines=1) +def randread(*args, **kwargs): + """Random read test. + + Args: + *args: None. + **kwargs: Additional container options. + + Returns: + The output of fio. + """ + return run_fio(*args, test="randread", **kwargs) + + +@suites.benchmark(metrics=[fio.write_bandwidth, fio.write_io_ops], machines=1) +def write(*args, **kwargs): + """Write test. + + Args: + *args: None. + **kwargs: Additional container options. + + Returns: + The output of fio. + """ + return run_fio(*args, test="write", **kwargs) + + +@suites.benchmark(metrics=[fio.write_bandwidth, fio.write_io_ops], machines=1) +def randwrite(*args, **kwargs): + """Random write test. + + Args: + *args: None. + **kwargs: Additional container options. + + Returns: + The output of fio. + """ + return run_fio(*args, test="randwrite", **kwargs) diff --git a/benchmarks/suites/helpers.py b/benchmarks/suites/helpers.py new file mode 100644 index 000000000..b3c7360ab --- /dev/null +++ b/benchmarks/suites/helpers.py @@ -0,0 +1,57 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Benchmark helpers.""" + +import datetime +from benchmarks.harness import machine + + +class Timer: + """Helper to time runtime of some call. + + Usage: + + with Timer as t: + # do something. + t.get_time_in_seconds() + """ + + def __init__(self): + self._start = datetime.datetime.now() + + def __enter__(self): + self.start() + return self + + def start(self): + """Starts the timer.""" + self._start = datetime.datetime.now() + + def elapsed(self) -> float: + """Returns the elapsed time in seconds.""" + return (datetime.datetime.now() - self._start).total_seconds() + + def __exit__(self, exception_type, exception_value, exception_traceback): + pass + + +def drop_caches(target: machine.Machine): + """Drops caches on the machine. + + Args: + target: A machine object. + """ + target.run("sudo sync") + target.run("sudo sysctl vm.drop_caches=3") + target.run("sudo sysctl vm.drop_caches=3") diff --git a/benchmarks/suites/http.py b/benchmarks/suites/http.py new file mode 100644 index 000000000..ea9024e43 --- /dev/null +++ b/benchmarks/suites/http.py @@ -0,0 +1,138 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""HTTP benchmarks.""" + +from benchmarks import suites +from benchmarks.harness import machine +from benchmarks.workloads import ab + + +# pylint: disable=too-many-arguments +def http(server: machine.Machine, + client: machine.Machine, + workload: str, + requests: int = 5000, + connections: int = 10, + port: int = 80, + path: str = "notfound", + **kwargs) -> str: + """Run apachebench (ab) against an http server. + + Args: + server: A machine object. + client: A machine object. + workload: The http-serving workload. + requests: Number of requests to send the server. Default is 5000. + connections: Number of concurent connections to use. Default is 10. + port: The port to access in benchmarking. + path: File to download, generally workload-specific. + **kwargs: Additional container options. + + Returns: + The full apachebench output. + """ + # Pull the client & server. + apachebench = client.pull("ab") + netcat = client.pull("netcat") + image = server.pull(workload) + + with server.container(image, port=port, **kwargs).detach() as container: + (host, port) = container.address() + # Wait for the server to come up. + client.container(netcat).run(host=host, port=port) + # Run the benchmark, no arguments. + return client.container(apachebench).run( + host=host, + port=port, + requests=requests, + connections=connections, + path=path) + + +# pylint: disable=too-many-arguments +# pylint: disable=too-many-locals +def http_app(server: machine.Machine, + client: machine.Machine, + workload: str, + requests: int = 5000, + connections: int = 10, + port: int = 80, + path: str = "notfound", + **kwargs) -> str: + """Run apachebench (ab) against an http application. + + Args: + server: A machine object. + client: A machine object. + workload: The http-serving workload. + requests: Number of requests to send the server. Default is 5000. + connections: Number of concurent connections to use. Default is 10. + port: The port to use for benchmarking. + path: File to download, generally workload-specific. + **kwargs: Additional container options. + + Returns: + The full apachebench output. + """ + # Pull the client & server. + apachebench = client.pull("ab") + netcat = client.pull("netcat") + server_netcat = server.pull("netcat") + redis = server.pull("redis") + image = server.pull(workload) + redis_port = 6379 + redis_name = "redis_server" + + with server.container(redis, name=redis_name).detach(): + server.container(server_netcat, links={redis_name: redis_name})\ + .run(host=redis_name, port=redis_port) + with server.container(image, port=port, links={redis_name: redis_name}, **kwargs)\ + .detach(host=redis_name) as container: + (host, port) = container.address() + # Wait for the server to come up. + client.container(netcat).run(host=host, port=port) + # Run the benchmark, no arguments. + return client.container(apachebench).run( + host=host, + port=port, + requests=requests, + connections=connections, + path=path) + + +@suites.benchmark(metrics=[ab.transfer_rate, ab.latency], machines=2) +def httpd(*args, **kwargs) -> str: + """Apache2 benchmark.""" + return http(*args, workload="httpd", port=80, **kwargs) + + +@suites.benchmark( + metrics=[ab.transfer_rate, ab.latency, ab.requests_per_second], machines=2) +def nginx(*args, **kwargs) -> str: + """Nginx benchmark.""" + return http(*args, workload="nginx", port=80, **kwargs) + + +@suites.benchmark( + metrics=[ab.transfer_rate, ab.latency, ab.requests_per_second], machines=2) +def node(*args, **kwargs) -> str: + """Node benchmark.""" + return http_app(*args, workload="node_template", path="", port=8080, **kwargs) + + +@suites.benchmark( + metrics=[ab.transfer_rate, ab.latency, ab.requests_per_second], machines=2) +def ruby(*args, **kwargs) -> str: + """Ruby benchmark.""" + return http_app(*args, workload="ruby_template", path="", port=9292, **kwargs) diff --git a/benchmarks/suites/media.py b/benchmarks/suites/media.py new file mode 100644 index 000000000..9cbffdaa1 --- /dev/null +++ b/benchmarks/suites/media.py @@ -0,0 +1,42 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Media processing benchmarks.""" + +from benchmarks import suites +from benchmarks.harness import machine +from benchmarks.suites import helpers +from benchmarks.workloads import ffmpeg + + +@suites.benchmark(metrics=[ffmpeg.run_time], machines=1) +def transcode(target: machine.Machine, **kwargs) -> float: + """Runs a video transcoding workload and times it. + + Args: + target: A machine object. + **kwargs: Additional container options. + + Returns: + Total workload runtime. + """ + # Load before timing. + image = target.pull("ffmpeg") + + # Drop caches. + helpers.drop_caches(target) + + # Time startup + transcoding. + with helpers.Timer() as timer: + target.container(image, **kwargs).run() + return timer.elapsed() diff --git a/benchmarks/suites/ml.py b/benchmarks/suites/ml.py new file mode 100644 index 000000000..a394d1f69 --- /dev/null +++ b/benchmarks/suites/ml.py @@ -0,0 +1,33 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Machine Learning tests.""" + +from benchmarks import suites +from benchmarks.harness import machine +from benchmarks.suites import startup +from benchmarks.workloads import tensorflow + + +@suites.benchmark(metrics=[tensorflow.run_time], machines=1) +def train(target: machine.Machine, **kwargs): + """Run the tensorflow benchmark and return the runtime in seconds of workload. + + Args: + target: A machine object. + **kwargs: Additional container options. + + Returns: + The total runtime. + """ + return startup.startup(target, workload="tensorflow", count=1, **kwargs) diff --git a/benchmarks/suites/network.py b/benchmarks/suites/network.py new file mode 100644 index 000000000..f973cf3f1 --- /dev/null +++ b/benchmarks/suites/network.py @@ -0,0 +1,101 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Network microbenchmarks.""" + +from typing import Dict + +from benchmarks import suites +from benchmarks.harness import machine +from benchmarks.suites import helpers +from benchmarks.workloads import iperf + + +def run_iperf(client: machine.Machine, + server: machine.Machine, + client_kwargs: Dict[str, str] = None, + server_kwargs: Dict[str, str] = None) -> str: + """Measure iperf performance. + + Args: + client: A machine object. + server: A machine object. + client_kwargs: Additional client container options. + server_kwargs: Additional server container options. + + Returns: + The output of iperf. + """ + if not client_kwargs: + client_kwargs = dict() + if not server_kwargs: + server_kwargs = dict() + + # Pull images. + netcat = client.pull("netcat") + iperf_client_image = client.pull("iperf") + iperf_server_image = server.pull("iperf") + + # Set this due to a bug in the kernel that resets connections. + client.run("sudo /sbin/sysctl -w net.netfilter.nf_conntrack_tcp_be_liberal=1") + server.run("sudo /sbin/sysctl -w net.netfilter.nf_conntrack_tcp_be_liberal=1") + + with server.container( + iperf_server_image, port=5001, **server_kwargs).detach() as iperf_server: + (host, port) = iperf_server.address() + # Wait until the service is available. + client.container(netcat).run(host=host, port=port) + # Run a warm-up run. + client.container( + iperf_client_image, stderr=True, **client_kwargs).run( + host=host, port=port) + # Run the client with relevant arguments. + res = client.container(iperf_client_image, stderr=True, **client_kwargs)\ + .run(host=host, port=port) + helpers.drop_caches(client) + return res + + +@suites.benchmark(metrics=[iperf.bandwidth], machines=2) +def upload(client: machine.Machine, server: machine.Machine, **kwargs) -> str: + """Measure upload performance. + + Args: + client: A machine object. + server: A machine object. + **kwargs: Client container options. + + Returns: + The output of iperf. + """ + if kwargs["runtime"] == "runc": + kwargs["network_mode"] = "host" + return run_iperf(client, server, client_kwargs=kwargs) + + +@suites.benchmark(metrics=[iperf.bandwidth], machines=2) +def download(client: machine.Machine, server: machine.Machine, **kwargs) -> str: + """Measure download performance. + + Args: + client: A machine object. + server: A machine object. + **kwargs: Server container options. + + Returns: + The output of iperf. + """ + + client_kwargs = {"network_mode": "host"} + return run_iperf( + client, server, client_kwargs=client_kwargs, server_kwargs=kwargs) diff --git a/benchmarks/suites/redis.py b/benchmarks/suites/redis.py new file mode 100644 index 000000000..b84dd073d --- /dev/null +++ b/benchmarks/suites/redis.py @@ -0,0 +1,46 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Redis benchmarks.""" + +from benchmarks import suites +from benchmarks.harness import machine +from benchmarks.workloads import redisbenchmark + + +@suites.benchmark(metrics=list(redisbenchmark.METRICS.values()), machines=2) +def redis(server: machine.Machine, + client: machine.Machine, + flags: str = "", + **kwargs) -> str: + """Run redis-benchmark on client pointing at server machine. + + Args: + server: A machine object. + client: A machine object. + flags: Flags to pass redis-benchmark. + **kwargs: Additional container options. + + Returns: + Output from redis-benchmark. + """ + redis_server = server.pull("redis") + redis_client = client.pull("redisbenchmark") + netcat = client.pull("netcat") + with server.container( + redis_server, port=6379, **kwargs).detach() as container: + (host, port) = container.address() + # Wait for the container to be up. + client.container(netcat).run(host=host, port=port) + # Run all redis benchmarks. + return client.container(redis_client).run(host=host, port=port, flags=flags) diff --git a/benchmarks/suites/startup.py b/benchmarks/suites/startup.py new file mode 100644 index 000000000..a1b6c5753 --- /dev/null +++ b/benchmarks/suites/startup.py @@ -0,0 +1,110 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Start-up benchmarks.""" + +from benchmarks import suites +from benchmarks.harness import machine +from benchmarks.suites import helpers + + +# pylint: disable=unused-argument +def startup_time_ms(value, **kwargs): + """Returns average startup time per container in milliseconds. + + Args: + value: The floating point time in seconds. + **kwargs: Ignored. + + Returns: + The time given in milliseconds. + """ + return value * 1000 + + +def startup(target: machine.Machine, + workload: str, + count: int = 5, + port: int = 0, + **kwargs): + """Time the startup of some workload. + + Args: + target: A machine object. + workload: The workload to run. + count: Number of containers to start. + port: The port to check for liveness, if provided. + **kwargs: Additional container options. + + Returns: + The mean start-up time in seconds. + """ + # Load before timing. + image = target.pull(workload) + netcat = target.pull("netcat") + count = int(count) + port = int(port) + + with helpers.Timer() as timer: + for _ in range(count): + if not port: + # Run the container synchronously. + target.container(image, **kwargs).run() + else: + # Run a detached container until httpd available. + with target.container(image, port=port, **kwargs).detach() as server: + (server_host, server_port) = server.address() + target.container(netcat).run(host=server_host, port=server_port) + return timer.elapsed() / float(count) + + +@suites.benchmark(metrics=[startup_time_ms], machines=1) +def empty(target: machine.Machine, **kwargs) -> float: + """Time the startup of a trivial container. + + Args: + target: A machine object. + **kwargs: Additional startup options. + + Returns: + The time to run the container. + """ + return startup(target, workload="true", **kwargs) + + +@suites.benchmark(metrics=[startup_time_ms], machines=1) +def node(target: machine.Machine, **kwargs) -> float: + """Time the startup of the node container. + + Args: + target: A machine object. + **kwargs: Additional statup options. + + Returns: + The time to run the container. + """ + return startup(target, workload="node", port=8080, **kwargs) + + +@suites.benchmark(metrics=[startup_time_ms], machines=1) +def ruby(target: machine.Machine, **kwargs) -> float: + """Time the startup of the ruby container. + + Args: + target: A machine object. + **kwargs: Additional startup options. + + Returns: + The time to run the container. + """ + return startup(target, workload="ruby", port=3000, **kwargs) diff --git a/benchmarks/suites/sysbench.py b/benchmarks/suites/sysbench.py new file mode 100644 index 000000000..2a6e2126c --- /dev/null +++ b/benchmarks/suites/sysbench.py @@ -0,0 +1,119 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Sysbench-based benchmarks.""" + +from benchmarks import suites +from benchmarks.harness import machine +from benchmarks.workloads import sysbench + + +def run_sysbench(target: machine.Machine, + test: str = "cpu", + threads: int = 8, + time: int = 5, + options: str = "", + **kwargs) -> str: + """Run sysbench container with arguments. + + Args: + target: A machine object. + test: Relevant sysbench test to run (e.g. cpu, memory). + threads: The number of threads to use for tests. + time: The time to run tests. + options: Additional sysbench options. + **kwargs: Additional container options. + + Returns: + The output of the command as a string. + """ + image = target.pull("sysbench") + return target.container(image, **kwargs).run( + test=test, threads=threads, time=time, options=options) + + +@suites.benchmark(metrics=[sysbench.cpu_events_per_second], machines=1) +def cpu(target: machine.Machine, max_prime: int = 5000, **kwargs) -> str: + """Run sysbench CPU test. + + Additional arguments can be provided for sysbench. + + Args: + target: A machine object. + max_prime: The maximum prime number to search. + **kwargs: + - threads: The number of threads to use for tests. + - time: The time to run tests. + - options: Additional sysbench options. See sysbench tool: + https://github.com/akopytov/sysbench + + Returns: + Sysbench output. + """ + options = kwargs.pop("options", "") + options += " --cpu-max-prime={}".format(max_prime) + return run_sysbench(target, test="cpu", options=options, **kwargs) + + +@suites.benchmark(metrics=[sysbench.memory_ops_per_second], machines=1) +def memory(target: machine.Machine, **kwargs) -> str: + """Run sysbench memory test. + + Additional arguments can be provided per sysbench. + + Args: + target: A machine object. + **kwargs: + - threads: The number of threads to use for tests. + - time: The time to run tests. + - options: Additional sysbench options. See sysbench tool: + https://github.com/akopytov/sysbench + + Returns: + Sysbench output. + """ + return run_sysbench(target, test="memory", **kwargs) + + +@suites.benchmark( + metrics=[ + sysbench.mutex_time, sysbench.mutex_latency, sysbench.mutex_deviation + ], + machines=1) +def mutex(target: machine.Machine, + locks: int = 4, + count: int = 10000000, + threads: int = 8, + **kwargs) -> str: + """Run sysbench mutex test. + + Additional arguments can be provided per sysbench. + + Args: + target: A machine object. + locks: The number of locks to use. + count: The number of mutexes. + threads: The number of threads to use for tests. + **kwargs: + - time: The time to run tests. + - options: Additional sysbench options. See sysbench tool: + https://github.com/akopytov/sysbench + + Returns: + Sysbench output. + """ + options = kwargs.pop("options", "") + options += " --mutex-loops=1 --mutex-locks={} --mutex-num={}".format( + count, locks) + return run_sysbench( + target, test="mutex", options=options, threads=threads, **kwargs) diff --git a/benchmarks/suites/syscall.py b/benchmarks/suites/syscall.py new file mode 100644 index 000000000..fa7665b00 --- /dev/null +++ b/benchmarks/suites/syscall.py @@ -0,0 +1,37 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Syscall microbenchmark.""" + +from benchmarks import suites +from benchmarks.harness import machine +from benchmarks.workloads.syscall import syscall_time_ns + + +@suites.benchmark(metrics=[syscall_time_ns], machines=1) +def syscall(target: machine.Machine, count: int = 1000000, **kwargs) -> str: + """Runs the syscall workload and report the syscall time. + + Runs the syscall 'SYS_gettimeofday(0,0)' 'count' times and monitors time + elapsed based on the runtime's MONOTONIC clock. + + Args: + target: A machine object. + count: The number of syscalls to execute. + **kwargs: Additional container options. + + Returns: + Container output. + """ + image = target.pull("syscall") + return target.container(image, **kwargs).run(count=count) diff --git a/benchmarks/tcp/BUILD b/benchmarks/tcp/BUILD new file mode 100644 index 000000000..735d7127f --- /dev/null +++ b/benchmarks/tcp/BUILD @@ -0,0 +1,41 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("@rules_cc//cc:defs.bzl", "cc_binary") + +package(licenses = ["notice"]) + +go_binary( + name = "tcp_proxy", + srcs = ["tcp_proxy.go"], + visibility = ["//:sandbox"], + deps = [ + "//pkg/tcpip", + "//pkg/tcpip/adapters/gonet", + "//pkg/tcpip/link/fdbased", + "//pkg/tcpip/network/arp", + "//pkg/tcpip/network/ipv4", + "//pkg/tcpip/stack", + "//pkg/tcpip/transport/tcp", + "//pkg/tcpip/transport/udp", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +# nsjoin is a trivial replacement for nsenter. This is used because nsenter is +# not available on all systems where this benchmark is run (and we aim to +# minimize external dependencies.) + +cc_binary( + name = "nsjoin", + srcs = ["nsjoin.c"], + visibility = ["//:sandbox"], +) + +sh_binary( + name = "tcp_benchmark", + srcs = ["tcp_benchmark.sh"], + data = [ + ":nsjoin", + ":tcp_proxy", + ], + visibility = ["//:sandbox"], +) diff --git a/benchmarks/tcp/README.md b/benchmarks/tcp/README.md new file mode 100644 index 000000000..38e6e69f0 --- /dev/null +++ b/benchmarks/tcp/README.md @@ -0,0 +1,87 @@ +# TCP Benchmarks + +This directory contains a standardized TCP benchmark. This helps to evaluate the +performance of netstack and native networking stacks under various conditions. + +## `tcp_benchmark` + +This benchmark allows TCP throughput testing under various conditions. The setup +consists of an iperf client, a client proxy, a server proxy and an iperf server. +The client proxy and server proxy abstract the network mechanism used to +communicate between the iperf client and server. + +The setup looks like the following: + +``` + +--------------+ (native) +--------------+ + | iperf client |[lo @ 10.0.0.1]------>| client proxy | + +--------------+ +--------------+ + [client.0 @ 10.0.0.2] + (netstack) | | (native) + +------+-----+ + | + [br0] + | + Network emulation applied ---> [wan.0:wan.1] + | + [br1] + | + +------+-----+ + (netstack) | | (native) + [server.0 @ 10.0.0.3] + +--------------+ +--------------+ + | iperf server |<------[lo @ 10.0.0.4]| server proxy | + +--------------+ (native) +--------------+ +``` + +Different configurations can be run using different arguments. For example: + +* Native test under normal internet conditions: `tcp_benchmark` +* Native test under ideal conditions: `tcp_benchmark --ideal` +* Netstack client under ideal conditions: `tcp_benchmark --client --ideal` +* Netstack client with 5% packet loss: `tcp_benchmark --client --ideal --loss + 5` + +Use `tcp_benchmark --help` for full arguments. + +This tool may be used to easily generate data for graphing. For example, to +generate a CSV for various latencies, you might do: + +``` +rm -f /tmp/netstack_latency.csv /tmp/native_latency.csv +latencies=$(seq 0 5 50; + seq 60 10 100; + seq 125 25 250; + seq 300 50 500) +for latency in $latencies; do + read throughput client_cpu server_cpu <<< \ + $(./tcp_benchmark --duration 30 --client --ideal --latency $latency) + echo $latency,$throughput,$client_cpu >> /tmp/netstack_latency.csv +done +for latency in $latencies; do + read throughput client_cpu server_cpu <<< \ + $(./tcp_benchmark --duration 30 --ideal --latency $latency) + echo $latency,$throughput,$client_cpu >> /tmp/native_latency.csv +done +``` + +Similarly, to generate a CSV for various levels of packet loss, the following +would be appropriate: + +``` +rm -f /tmp/netstack_loss.csv /tmp/native_loss.csv +losses=$(seq 0 0.1 1.0; + seq 1.2 0.2 2.0; + seq 2.5 0.5 5.0; + seq 6.0 1.0 10.0) +for loss in $losses; do + read throughput client_cpu server_cpu <<< \ + $(./tcp_benchmark --duration 30 --client --ideal --latency 10 --loss $loss) + echo $loss,$throughput,$client_cpu >> /tmp/netstack_loss.csv +done +for loss in $losses; do + read throughput client_cpu server_cpu <<< \ + $(./tcp_benchmark --duration 30 --ideal --latency 10 --loss $loss) + echo $loss,$throughput,$client_cpu >> /tmp/native_loss.csv +done +``` diff --git a/benchmarks/tcp/nsjoin.c b/benchmarks/tcp/nsjoin.c new file mode 100644 index 000000000..524b4d549 --- /dev/null +++ b/benchmarks/tcp/nsjoin.c @@ -0,0 +1,47 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <errno.h> +#include <fcntl.h> +#include <sched.h> +#include <stdio.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +int main(int argc, char** argv) { + if (argc <= 2) { + fprintf(stderr, "error: must provide a namespace file.\n"); + fprintf(stderr, "usage: %s <file> [arguments...]\n", argv[0]); + return 1; + } + + int fd = open(argv[1], O_RDONLY); + if (fd < 0) { + fprintf(stderr, "error opening %s: %s\n", argv[1], strerror(errno)); + return 1; + } + if (setns(fd, 0) < 0) { + fprintf(stderr, "error joining %s: %s\n", argv[1], strerror(errno)); + return 1; + } + + execvp(argv[2], &argv[2]); + return 1; +} diff --git a/benchmarks/tcp/tcp_benchmark.sh b/benchmarks/tcp/tcp_benchmark.sh new file mode 100755 index 000000000..69344c9c3 --- /dev/null +++ b/benchmarks/tcp/tcp_benchmark.sh @@ -0,0 +1,369 @@ +#!/bin/bash + +# Copyright 2018 The gVisor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TCP benchmark; see README.md for documentation. + +# Fixed parameters. +iperf_port=45201 # Not likely to be privileged. +proxy_port=44000 # Ditto. +client_addr=10.0.0.1 +client_proxy_addr=10.0.0.2 +server_proxy_addr=10.0.0.3 +server_addr=10.0.0.4 +mask=8 + +# Defaults; this provides a reasonable approximation of a decent internet link. +# Parameters can be varied independently from this set to see response to +# various changes in the kind of link available. +client=false +server=false +verbose=false +gso=0 +swgso=false +mtu=1280 # 1280 is a reasonable lowest-common-denominator. +latency=10 # 10ms approximates a fast, dedicated connection. +latency_variation=1 # +/- 1ms is a relatively low amount of jitter. +loss=0.1 # 0.1% loss is non-zero, but not extremely high. +duplicate=0.1 # 0.1% means duplicates are 1/10x as frequent as losses. +duration=30 # 30s is enough time to consistent results (experimentally). +helper_dir=$(dirname $0) +netstack_opts= + +# Check for netem support. +lsmod_output=$(lsmod | grep sch_netem) +if [ "$?" != "0" ]; then + echo "warning: sch_netem may not be installed." >&2 +fi + +while [ $# -gt 0 ]; do + case "$1" in + --client) + client=true + ;; + --client_tcp_probe_file) + shift + netstack_opts="${netstack_opts} -client_tcp_probe_file=$1" + ;; + --server) + server=true + ;; + --verbose) + verbose=true + ;; + --gso) + shift + gso=$1 + ;; + --swgso) + swgso=true + ;; + --server_tcp_probe_file) + shift + netstack_opts="${netstack_opts} -server_tcp_probe_file=$1" + ;; + --ideal) + mtu=1500 # Standard ethernet. + latency=0 # No latency. + latency_variation=0 # No jitter. + loss=0 # No loss. + duplicate=0 # No duplicates. + ;; + --mtu) + shift + [ "$#" -le 0 ] && echo "no mtu provided" && exit 1 + mtu=$1 + ;; + --sack) + netstack_opts="${netstack_opts} -sack" + ;; + --cubic) + netstack_opts="${netstack_opts} -cubic" + ;; + --duration) + shift + [ "$#" -le 0 ] && echo "no duration provided" && exit 1 + duration=$1 + ;; + --latency) + shift + [ "$#" -le 0 ] && echo "no latency provided" && exit 1 + latency=$1 + ;; + --latency-variation) + shift + [ "$#" -le 0 ] && echo "no latency variation provided" && exit 1 + latency_variation=$1 + ;; + --loss) + shift + [ "$#" -le 0 ] && echo "no loss probability provided" && exit 1 + loss=$1 + ;; + --duplicate) + shift + [ "$#" -le 0 ] && echo "no duplicate provided" && exit 1 + duplicate=$1 + ;; + --cpuprofile) + shift + netstack_opts="${netstack_opts} -cpuprofile=$1" + ;; + --memprofile) + shift + netstack_opts="${netstack_opts} -memprofile=$1" + ;; + --helpers) + shift + [ "$#" -le 0 ] && echo "no helper dir provided" && exit 1 + helper_dir=$1 + ;; + *) + echo "usage: $0 [options]" + echo "options:" + echo " --help show this message" + echo " --verbose verbose output" + echo " --client use netstack as the client" + echo " --ideal reset all network emulation" + echo " --server use netstack as the server" + echo " --mtu set the mtu (bytes)" + echo " --sack enable SACK support" + echo " --cubic enable CUBIC congestion control for Netstack" + echo " --duration set the test duration (s)" + echo " --latency set the latency (ms)" + echo " --latency-variation set the latency variation" + echo " --loss set the loss probability (%)" + echo " --duplicate set the duplicate probability (%)" + echo " --helpers set the helper directory" + echo "" + echo "The output will of the script will be:" + echo " <throughput> <client-cpu-usage> <server-cpu-usage>" + exit 1 + esac + shift +done + +if [ ${verbose} == "true" ]; then + set -x +fi + +# Latency needs to be halved, since it's applied on both ways. +half_latency=$(echo ${latency}/2 | bc -l | awk '{printf "%1.2f", $0}') +half_loss=$(echo ${loss}/2 | bc -l | awk '{printf "%1.6f", $0}') +half_duplicate=$(echo ${duplicate}/2 | bc -l | awk '{printf "%1.6f", $0}') +helper_dir=${helper_dir#$(pwd)/} # Use relative paths. +proxy_binary=${helper_dir}/tcp_proxy +nsjoin_binary=${helper_dir}/nsjoin + +if [ ! -e ${proxy_binary} ]; then + echo "Could not locate ${proxy_binary}, please make sure you've built the binary" + exit 1 +fi + +if [ ! -e ${nsjoin_binary} ]; then + echo "Could not locate ${nsjoin_binary}, please make sure you've built the binary" + exit 1 +fi + +if [ $(echo ${latency_variation} | awk '{printf "%1.2f", $0}') != "0.00" ]; then + # As long as there's some jitter, then we use the paretonormal distribution. + # This will preserve the minimum RTT, but add a realistic amount of jitter to + # the connection and cause re-ordering, etc. The regular pareto distribution + # appears to an unreasonable level of delay (we want only small spikes.) + distribution="distribution paretonormal" +else + distribution="" +fi + +# Client proxy that will listen on the client's iperf target forward traffic +# using the host networking stack. +client_args="${proxy_binary} -port ${proxy_port} -forward ${server_proxy_addr}:${proxy_port}" +if ${client}; then + # Client proxy that will listen on the client's iperf target + # and forward traffic using netstack. + client_args="${proxy_binary} ${netstack_opts} -port ${proxy_port} -client \\ + -mtu ${mtu} -iface client.0 -addr ${client_proxy_addr} -mask ${mask} \\ + -forward ${server_proxy_addr}:${proxy_port} -gso=${gso} -swgso=${swgso}" +fi + +# Server proxy that will listen on the proxy port and forward to the server's +# iperf server using the host networking stack. +server_args="${proxy_binary} -port ${proxy_port} -forward ${server_addr}:${iperf_port}" +if ${server}; then + # Server proxy that will listen on the proxy port and forward to the servers' + # iperf server using netstack. + server_args="${proxy_binary} ${netstack_opts} -port ${proxy_port} -server \\ + -mtu ${mtu} -iface server.0 -addr ${server_proxy_addr} -mask ${mask} \\ + -forward ${server_addr}:${iperf_port} -gso=${gso} -swgso=${swgso}" +fi + +# Specify loss and duplicate parameters only if they are non-zero +loss_opt="" +if [ "$(echo $half_loss | bc -q)" != "0" ]; then + loss_opt="loss random ${half_loss}%" +fi +duplicate_opt="" +if [ "$(echo $half_duplicate | bc -q)" != "0" ]; then + duplicate_opt="duplicate ${half_duplicate}%" +fi + +exec unshare -U -m -n -r -f -p --mount-proc /bin/bash << EOF +set -e -m + +if [ ${verbose} == "true" ]; then + set -x +fi + +mount -t tmpfs netstack-bench /tmp + +# We may have reset the path in the unshare if the shell loaded some public +# profiles. Ensure that tools are discoverable via the parent's PATH. +export PATH=${PATH} + +# Add client, server interfaces. +ip link add client.0 type veth peer name client.1 +ip link add server.0 type veth peer name server.1 + +# Add network emulation devices. +ip link add wan.0 type veth peer name wan.1 +ip link set wan.0 up +ip link set wan.1 up + +# Enroll on the bridge. +ip link add name br0 type bridge +ip link add name br1 type bridge +ip link set client.1 master br0 +ip link set server.1 master br1 +ip link set wan.0 master br0 +ip link set wan.1 master br1 +ip link set br0 up +ip link set br1 up + +# Set the MTU appropriately. +ip link set client.0 mtu ${mtu} +ip link set server.0 mtu ${mtu} +ip link set wan.0 mtu ${mtu} +ip link set wan.1 mtu ${mtu} + +# Add appropriate latency, loss and duplication. +# +# This is added in at the point of bridge connection. +for device in wan.0 wan.1; do + # NOTE: We don't support a loss correlation as testing has shown that it + # actually doesn't work. The man page actually has a small comment about this + # "It is also possible to add a correlation, but this option is now deprecated + # due to the noticed bad behavior." For more information see netem(8). + tc qdisc add dev \$device root netem \\ + delay ${half_latency}ms ${latency_variation}ms ${distribution} \\ + ${loss_opt} ${duplicate_opt} +done + +# Start a client proxy. +touch /tmp/client.netns +unshare -n mount --bind /proc/self/ns/net /tmp/client.netns + +# Move the endpoint into the namespace. +while ip link | grep client.0 > /dev/null; do + ip link set dev client.0 netns /tmp/client.netns +done + +if ! ${client}; then + # Only add the address to NIC if netstack is not in use. Otherwise the host + # will also process the inbound SYN and send a RST back. + ${nsjoin_binary} /tmp/client.netns ip addr add ${client_proxy_addr}/${mask} dev client.0 +fi + +# Start a server proxy. +touch /tmp/server.netns +unshare -n mount --bind /proc/self/ns/net /tmp/server.netns +# Move the endpoint into the namespace. +while ip link | grep server.0 > /dev/null; do + ip link set dev server.0 netns /tmp/server.netns +done +if ! ${server}; then + # Only add the address to NIC if netstack is not in use. Otherwise the host + # will also process the inbound SYN and send a RST back. + ${nsjoin_binary} /tmp/server.netns ip addr add ${server_proxy_addr}/${mask} dev server.0 +fi + +# Add client and server addresses, and bring everything up. +${nsjoin_binary} /tmp/client.netns ip addr add ${client_addr}/${mask} dev client.0 +${nsjoin_binary} /tmp/server.netns ip addr add ${server_addr}/${mask} dev server.0 +${nsjoin_binary} /tmp/client.netns ip link set client.0 up +${nsjoin_binary} /tmp/client.netns ip link set lo up +${nsjoin_binary} /tmp/server.netns ip link set server.0 up +${nsjoin_binary} /tmp/server.netns ip link set lo up +ip link set dev client.1 up +ip link set dev server.1 up + +${nsjoin_binary} /tmp/client.netns ${client_args} & +client_pid=\$! +${nsjoin_binary} /tmp/server.netns ${server_args} & +server_pid=\$! + +# Start the iperf server. +${nsjoin_binary} /tmp/server.netns iperf -p ${iperf_port} -s >&2 & +iperf_pid=\$! + +# Show traffic information. +if ! ${client} && ! ${server}; then + ${nsjoin_binary} /tmp/client.netns ping -c 100 -i 0.001 -W 1 ${server_addr} >&2 || true +fi + +results_file=\$(mktemp) +function cleanup { + rm -f \$results_file + kill -TERM \$client_pid + kill -TERM \$server_pid + wait \$client_pid + wait \$server_pid + kill -9 \$iperf_pid 2>/dev/null +} + +# Allow failure from this point. +set +e +trap cleanup EXIT + +# Run the benchmark, recording the results file. +while ${nsjoin_binary} /tmp/client.netns iperf \\ + -p ${proxy_port} -c ${client_addr} -t ${duration} -f m 2>&1 \\ + | tee \$results_file \\ + | grep "connect failed" >/dev/null; do + sleep 0.1 # Wait for all services. +done + +# Unlink all relevant devices from the bridge. This is because when the bridge +# is deleted, the kernel may hang. It appears that this problem is fixed in +# upstream commit 1ce5cce895309862d2c35d922816adebe094fe4a. +ip link set client.1 nomaster +ip link set server.1 nomaster +ip link set wan.0 nomaster +ip link set wan.1 nomaster + +# Emit raw results. +cat \$results_file >&2 + +# Emit a useful result (final throughput). +mbits=\$(grep Mbits/sec \$results_file \\ + | sed -n -e 's/^.*[[:space:]]\\([[:digit:]]\\+\\(\\.[[:digit:]]\\+\\)\\?\\)[[:space:]]*Mbits\\/sec.*/\\1/p') +client_cpu_ticks=\$(cat /proc/\$client_pid/stat \\ + | awk '{print (\$14+\$15);}') +server_cpu_ticks=\$(cat /proc/\$server_pid/stat \\ + | awk '{print (\$14+\$15);}') +ticks_per_sec=\$(getconf CLK_TCK) +client_cpu_load=\$(bc -l <<< \$client_cpu_ticks/\$ticks_per_sec/${duration}) +server_cpu_load=\$(bc -l <<< \$server_cpu_ticks/\$ticks_per_sec/${duration}) +echo \$mbits \$client_cpu_load \$server_cpu_load +EOF diff --git a/benchmarks/tcp/tcp_proxy.go b/benchmarks/tcp/tcp_proxy.go new file mode 100644 index 000000000..361a56755 --- /dev/null +++ b/benchmarks/tcp/tcp_proxy.go @@ -0,0 +1,436 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Binary tcp_proxy is a simple TCP proxy. +package main + +import ( + "encoding/gob" + "flag" + "fmt" + "io" + "log" + "math/rand" + "net" + "os" + "os/signal" + "regexp" + "runtime" + "runtime/pprof" + "strconv" + "syscall" + "time" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/adapters/gonet" + "gvisor.dev/gvisor/pkg/tcpip/link/fdbased" + "gvisor.dev/gvisor/pkg/tcpip/network/arp" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" + "gvisor.dev/gvisor/pkg/tcpip/transport/udp" +) + +var ( + port = flag.Int("port", 0, "bind port (all addresses)") + forward = flag.String("forward", "", "forwarding target") + client = flag.Bool("client", false, "use netstack for listen") + server = flag.Bool("server", false, "use netstack for dial") + + // Netstack-specific options. + mtu = flag.Int("mtu", 1280, "mtu for network stack") + addr = flag.String("addr", "", "address for tap-based netstack") + mask = flag.Int("mask", 8, "mask size for address") + iface = flag.String("iface", "", "network interface name to bind for netstack") + sack = flag.Bool("sack", false, "enable SACK support for netstack") + cubic = flag.Bool("cubic", false, "enable use of CUBIC congestion control for netstack") + gso = flag.Int("gso", 0, "GSO maximum size") + swgso = flag.Bool("swgso", false, "software-level GSO") + clientTCPProbeFile = flag.String("client_tcp_probe_file", "", "if specified, installs a tcp probe to dump endpoint state to the specified file.") + serverTCPProbeFile = flag.String("server_tcp_probe_file", "", "if specified, installs a tcp probe to dump endpoint state to the specified file.") + cpuprofile = flag.String("cpuprofile", "", "write cpu profile to the specified file.") + memprofile = flag.String("memprofile", "", "write memory profile to the specified file.") +) + +type impl interface { + dial(address string) (net.Conn, error) + listen(port int) (net.Listener, error) + printStats() +} + +type netImpl struct{} + +func (netImpl) dial(address string) (net.Conn, error) { + return net.Dial("tcp", address) +} + +func (netImpl) listen(port int) (net.Listener, error) { + return net.Listen("tcp", fmt.Sprintf(":%d", port)) +} + +func (netImpl) printStats() { +} + +const ( + nicID = 1 // Fixed. + rcvBufSize = 1 << 20 // 1MB. +) + +type netstackImpl struct { + s *stack.Stack + addr tcpip.Address + mode string +} + +func setupNetwork(ifaceName string) (fd int, err error) { + // Get all interfaces in the namespace. + ifaces, err := net.Interfaces() + if err != nil { + return -1, fmt.Errorf("querying interfaces: %v", err) + } + + for _, iface := range ifaces { + if iface.Name != ifaceName { + continue + } + // Create the socket. + const protocol = 0x0300 // htons(ETH_P_ALL) + fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol) + if err != nil { + return -1, fmt.Errorf("unable to create raw socket: %v", err) + } + + // Bind to the appropriate device. + ll := syscall.SockaddrLinklayer{ + Protocol: protocol, + Ifindex: iface.Index, + Pkttype: syscall.PACKET_HOST, + } + if err := syscall.Bind(fd, &ll); err != nil { + return -1, fmt.Errorf("unable to bind to %q: %v", iface.Name, err) + } + + // RAW Sockets by default have a very small SO_RCVBUF of 256KB, + // up it to at least 1MB to reduce packet drops. + if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF, rcvBufSize); err != nil { + return -1, fmt.Errorf("setsockopt(..., SO_RCVBUF, %v,..) = %v", rcvBufSize, err) + } + + if !*swgso && *gso != 0 { + if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil { + return -1, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err) + } + } + return fd, nil + } + return -1, fmt.Errorf("failed to find interface: %v", ifaceName) +} + +func newNetstackImpl(mode string) (impl, error) { + fd, err := setupNetwork(*iface) + if err != nil { + return nil, err + } + + // Parse details. + parsedAddr := tcpip.Address(net.ParseIP(*addr).To4()) + parsedDest := tcpip.Address("") // Filled in below. + parsedMask := tcpip.AddressMask("") // Filled in below. + switch *mask { + case 8: + parsedDest = tcpip.Address([]byte{parsedAddr[0], 0, 0, 0}) + parsedMask = tcpip.AddressMask([]byte{0xff, 0, 0, 0}) + case 16: + parsedDest = tcpip.Address([]byte{parsedAddr[0], parsedAddr[1], 0, 0}) + parsedMask = tcpip.AddressMask([]byte{0xff, 0xff, 0, 0}) + case 24: + parsedDest = tcpip.Address([]byte{parsedAddr[0], parsedAddr[1], parsedAddr[2], 0}) + parsedMask = tcpip.AddressMask([]byte{0xff, 0xff, 0xff, 0}) + default: + // This is just laziness; we don't expect a different mask. + return nil, fmt.Errorf("mask %d not supported", mask) + } + + // Create a new network stack. + netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), arp.NewProtocol()} + transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol()} + s := stack.New(stack.Options{ + NetworkProtocols: netProtos, + TransportProtocols: transProtos, + }) + + // Generate a new mac for the eth device. + mac := make(net.HardwareAddr, 6) + rand.Read(mac) // Fill with random data. + mac[0] &^= 0x1 // Clear multicast bit. + mac[0] |= 0x2 // Set local assignment bit (IEEE802). + ep, err := fdbased.New(&fdbased.Options{ + FDs: []int{fd}, + MTU: uint32(*mtu), + EthernetHeader: true, + Address: tcpip.LinkAddress(mac), + // Enable checksum generation as we need to generate valid + // checksums for the veth device to deliver our packets to the + // peer. But we do want to disable checksum verification as veth + // devices do perform GRO and the linux host kernel may not + // regenerate valid checksums after GRO. + TXChecksumOffload: false, + RXChecksumOffload: true, + PacketDispatchMode: fdbased.RecvMMsg, + GSOMaxSize: uint32(*gso), + SoftwareGSOEnabled: *swgso, + }) + if err != nil { + return nil, fmt.Errorf("failed to create FD endpoint: %v", err) + } + if err := s.CreateNIC(nicID, ep); err != nil { + return nil, fmt.Errorf("error creating NIC %q: %v", *iface, err) + } + if err := s.AddAddress(nicID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { + return nil, fmt.Errorf("error adding ARP address to %q: %v", *iface, err) + } + if err := s.AddAddress(nicID, ipv4.ProtocolNumber, parsedAddr); err != nil { + return nil, fmt.Errorf("error adding IP address to %q: %v", *iface, err) + } + + subnet, err := tcpip.NewSubnet(parsedDest, parsedMask) + if err != nil { + return nil, fmt.Errorf("tcpip.Subnet(%s, %s): %s", parsedDest, parsedMask, err) + } + // Add default route; we only support + s.SetRouteTable([]tcpip.Route{ + { + Destination: subnet, + NIC: nicID, + }, + }) + + // Set protocol options. + if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(*sack)); err != nil { + return nil, fmt.Errorf("SetTransportProtocolOption for SACKEnabled failed: %v", err) + } + + // Set Congestion Control to cubic if requested. + if *cubic { + if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.CongestionControlOption("cubic")); err != nil { + return nil, fmt.Errorf("SetTransportProtocolOption for CongestionControlOption(cubic) failed: %v", err) + } + } + + return netstackImpl{ + s: s, + addr: parsedAddr, + mode: mode, + }, nil +} + +func (n netstackImpl) dial(address string) (net.Conn, error) { + host, port, err := net.SplitHostPort(address) + if err != nil { + return nil, err + } + if host == "" { + // A host must be provided for the dial. + return nil, fmt.Errorf("no host provided") + } + portNumber, err := strconv.Atoi(port) + if err != nil { + return nil, err + } + addr := tcpip.FullAddress{ + NIC: nicID, + Addr: tcpip.Address(net.ParseIP(host).To4()), + Port: uint16(portNumber), + } + conn, err := gonet.DialTCP(n.s, addr, ipv4.ProtocolNumber) + if err != nil { + return nil, err + } + return conn, nil +} + +func (n netstackImpl) listen(port int) (net.Listener, error) { + addr := tcpip.FullAddress{ + NIC: nicID, + Port: uint16(port), + } + listener, err := gonet.NewListener(n.s, addr, ipv4.ProtocolNumber) + if err != nil { + return nil, err + } + return listener, nil +} + +var zeroFieldsRegexp = regexp.MustCompile(`\s*[a-zA-Z0-9]*:0`) + +func (n netstackImpl) printStats() { + // Don't show zero fields. + stats := zeroFieldsRegexp.ReplaceAllString(fmt.Sprintf("%+v", n.s.Stats()), "") + log.Printf("netstack %s Stats: %+v\n", n.mode, stats) +} + +// installProbe installs a TCP Probe function that will dump endpoint +// state to the specified file. It also returns a close func() that +// can be used to close the probeFile. +func (n netstackImpl) installProbe(probeFileName string) (close func()) { + // Install Probe to dump out end point state. + probeFile, err := os.Create(probeFileName) + if err != nil { + log.Fatalf("failed to create tcp_probe file %s: %v", probeFileName, err) + } + probeEncoder := gob.NewEncoder(probeFile) + // Install a TCP Probe. + n.s.AddTCPProbe(func(state stack.TCPEndpointState) { + probeEncoder.Encode(state) + }) + return func() { probeFile.Close() } +} + +func main() { + flag.Parse() + if *port == 0 { + log.Fatalf("no port provided") + } + if *forward == "" { + log.Fatalf("no forward provided") + } + // Seed the random number generator to ensure that we are given MAC addresses that don't + // for the case of the client and server stack. + rand.Seed(time.Now().UTC().UnixNano()) + + if *cpuprofile != "" { + f, err := os.Create(*cpuprofile) + if err != nil { + log.Fatal("could not create CPU profile: ", err) + } + defer func() { + if err := f.Close(); err != nil { + log.Print("error closing CPU profile: ", err) + } + }() + if err := pprof.StartCPUProfile(f); err != nil { + log.Fatal("could not start CPU profile: ", err) + } + defer pprof.StopCPUProfile() + } + + var ( + in impl + out impl + err error + ) + if *server { + in, err = newNetstackImpl("server") + if *serverTCPProbeFile != "" { + defer in.(netstackImpl).installProbe(*serverTCPProbeFile)() + } + + } else { + in = netImpl{} + } + if err != nil { + log.Fatalf("netstack error: %v", err) + } + if *client { + out, err = newNetstackImpl("client") + if *clientTCPProbeFile != "" { + defer out.(netstackImpl).installProbe(*clientTCPProbeFile)() + } + } else { + out = netImpl{} + } + if err != nil { + log.Fatalf("netstack error: %v", err) + } + + // Dial forward before binding. + var next net.Conn + for { + next, err = out.dial(*forward) + if err == nil { + break + } + time.Sleep(50 * time.Millisecond) + log.Printf("connect failed retrying: %v", err) + } + + // Bind once to the server socket. + listener, err := in.listen(*port) + if err != nil { + // Should not happen, everything must be bound by this time + // this proxy is started. + log.Fatalf("unable to listen: %v", err) + } + log.Printf("client=%v, server=%v, ready.", *client, *server) + + sigs := make(chan os.Signal, 1) + signal.Notify(sigs, syscall.SIGTERM) + go func() { + <-sigs + if *cpuprofile != "" { + pprof.StopCPUProfile() + } + if *memprofile != "" { + f, err := os.Create(*memprofile) + if err != nil { + log.Fatal("could not create memory profile: ", err) + } + defer func() { + if err := f.Close(); err != nil { + log.Print("error closing memory profile: ", err) + } + }() + runtime.GC() // get up-to-date statistics + if err := pprof.WriteHeapProfile(f); err != nil { + log.Fatalf("Unable to write heap profile: %v", err) + } + } + os.Exit(0) + }() + + for { + // Forward all connections. + inConn, err := listener.Accept() + if err != nil { + // This should not happen; we are listening + // successfully. Exhausted all available FDs? + log.Fatalf("accept error: %v", err) + } + log.Printf("incoming connection established.") + + // Copy both ways. + go io.Copy(inConn, next) + go io.Copy(next, inConn) + + // Print stats every second. + go func() { + t := time.NewTicker(time.Second) + defer t.Stop() + for { + <-t.C + in.printStats() + out.printStats() + } + }() + + for { + // Dial again. + next, err = out.dial(*forward) + if err == nil { + break + } + } + } +} diff --git a/benchmarks/workloads/BUILD b/benchmarks/workloads/BUILD new file mode 100644 index 000000000..643806105 --- /dev/null +++ b/benchmarks/workloads/BUILD @@ -0,0 +1,35 @@ +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +py_library( + name = "workloads", + srcs = ["__init__.py"], +) + +filegroup( + name = "files", + srcs = [ + "//benchmarks/workloads/ab:files", + "//benchmarks/workloads/absl:files", + "//benchmarks/workloads/curl:files", + "//benchmarks/workloads/ffmpeg:files", + "//benchmarks/workloads/fio:files", + "//benchmarks/workloads/httpd:files", + "//benchmarks/workloads/iperf:files", + "//benchmarks/workloads/netcat:files", + "//benchmarks/workloads/nginx:files", + "//benchmarks/workloads/node:files", + "//benchmarks/workloads/node_template:files", + "//benchmarks/workloads/redis:files", + "//benchmarks/workloads/redisbenchmark:files", + "//benchmarks/workloads/ruby:files", + "//benchmarks/workloads/ruby_template:files", + "//benchmarks/workloads/sleep:files", + "//benchmarks/workloads/sysbench:files", + "//benchmarks/workloads/syscall:files", + "//benchmarks/workloads/tensorflow:files", + "//benchmarks/workloads/true:files", + ], +) diff --git a/benchmarks/workloads/__init__.py b/benchmarks/workloads/__init__.py new file mode 100644 index 000000000..e12651e76 --- /dev/null +++ b/benchmarks/workloads/__init__.py @@ -0,0 +1,14 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Workloads, parsers and test data.""" diff --git a/benchmarks/workloads/ab/BUILD b/benchmarks/workloads/ab/BUILD new file mode 100644 index 000000000..e99a8d674 --- /dev/null +++ b/benchmarks/workloads/ab/BUILD @@ -0,0 +1,35 @@ +load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement") + +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +py_library( + name = "ab", + srcs = ["__init__.py"], +) + +py_test( + name = "ab_test", + srcs = ["ab_test.py"], + python_version = "PY3", + deps = [ + ":ab", + requirement("attrs", False), + requirement("atomicwrites", False), + requirement("more-itertools", False), + requirement("pathlib2", False), + requirement("pluggy", False), + requirement("py", False), + requirement("pytest", True), + requirement("six", False), + ], +) + +filegroup( + name = "files", + srcs = [ + "Dockerfile", + ], +) diff --git a/benchmarks/workloads/ab/Dockerfile b/benchmarks/workloads/ab/Dockerfile new file mode 100644 index 000000000..0d0b6e2eb --- /dev/null +++ b/benchmarks/workloads/ab/Dockerfile @@ -0,0 +1,15 @@ +FROM ubuntu:18.04 + +RUN set -x \ + && apt-get update \ + && apt-get install -y \ + apache2-utils \ + && rm -rf /var/lib/apt/lists/* + +# Parameterized workload. +ENV requests 5000 +ENV connections 10 +ENV host localhost +ENV port 8080 +ENV path notfound +CMD ["sh", "-c", "ab -n ${requests} -c ${connections} http://${host}:${port}/${path}"] diff --git a/benchmarks/workloads/ab/__init__.py b/benchmarks/workloads/ab/__init__.py new file mode 100644 index 000000000..eedf8e083 --- /dev/null +++ b/benchmarks/workloads/ab/__init__.py @@ -0,0 +1,88 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Apachebench tool.""" + +import re + +SAMPLE_DATA = """This is ApacheBench, Version 2.3 <$Revision: 1826891 $> +Copyright 1996 Adam Twiss, Zeus Technology Ltd, http://www.zeustech.net/ +Licensed to The Apache Software Foundation, http://www.apache.org/ + +Benchmarking 10.10.10.10 (be patient).....done + + +Server Software: Apache/2.4.38 +Server Hostname: 10.10.10.10 +Server Port: 80 + +Document Path: /latin10k.txt +Document Length: 210 bytes + +Concurrency Level: 1 +Time taken for tests: 0.180 seconds +Complete requests: 100 +Failed requests: 0 +Non-2xx responses: 100 +Total transferred: 38800 bytes +HTML transferred: 21000 bytes +Requests per second: 556.44 [#/sec] (mean) +Time per request: 1.797 [ms] (mean) +Time per request: 1.797 [ms] (mean, across all concurrent requests) +Transfer rate: 210.84 [Kbytes/sec] received + +Connection Times (ms) + min mean[+/-sd] median max +Connect: 0 0 0.2 0 2 +Processing: 1 2 1.0 1 8 +Waiting: 1 1 1.0 1 7 +Total: 1 2 1.2 1 10 + +Percentage of the requests served within a certain time (ms) + 50% 1 + 66% 2 + 75% 2 + 80% 2 + 90% 2 + 95% 3 + 98% 7 + 99% 10 + 100% 10 (longest request)""" + + +# pylint: disable=unused-argument +def sample(**kwargs) -> str: + return SAMPLE_DATA + + +# pylint: disable=unused-argument +def transfer_rate(data: str, **kwargs) -> float: + """Mean transfer rate in Kbytes/sec.""" + regex = r"Transfer rate:\s+(\d+\.?\d+?)\s+\[Kbytes/sec\]\s+received" + return float(re.compile(regex).search(data).group(1)) + + +# pylint: disable=unused-argument +def latency(data: str, **kwargs) -> float: + """Mean latency in milliseconds.""" + regex = r"Total:\s+\d+\s+(\d+)\s+(\d+\.?\d+?)\s+\d+\s+\d+\s" + res = re.compile(regex).search(data) + return float(res.group(1)) + + +# pylint: disable=unused-argument +def requests_per_second(data: str, **kwargs) -> float: + """Requests per second.""" + regex = r"Requests per second:\s+(\d+\.?\d+?)\s+" + res = re.compile(regex).search(data) + return float(res.group(1)) diff --git a/benchmarks/workloads/ab/ab_test.py b/benchmarks/workloads/ab/ab_test.py new file mode 100644 index 000000000..4afac2996 --- /dev/null +++ b/benchmarks/workloads/ab/ab_test.py @@ -0,0 +1,42 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Parser test.""" + +import sys + +import pytest + +from benchmarks.workloads import ab + + +def test_transfer_rate_parser(): + """Test transfer rate parser.""" + res = ab.transfer_rate(ab.sample()) + assert res == 210.84 + + +def test_latency_parser(): + """Test latency parser.""" + res = ab.latency(ab.sample()) + assert res == 2 + + +def test_requests_per_second(): + """Test requests per second parser.""" + res = ab.requests_per_second(ab.sample()) + assert res == 556.44 + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/benchmarks/workloads/absl/BUILD b/benchmarks/workloads/absl/BUILD new file mode 100644 index 000000000..bb499620e --- /dev/null +++ b/benchmarks/workloads/absl/BUILD @@ -0,0 +1,35 @@ +load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement") + +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +py_library( + name = "absl", + srcs = ["__init__.py"], +) + +py_test( + name = "absl_test", + srcs = ["absl_test.py"], + python_version = "PY3", + deps = [ + ":absl", + requirement("attrs", False), + requirement("atomicwrites", False), + requirement("more-itertools", False), + requirement("pathlib2", False), + requirement("pluggy", False), + requirement("py", False), + requirement("pytest", True), + requirement("six", False), + ], +) + +filegroup( + name = "files", + srcs = [ + "Dockerfile", + ], +) diff --git a/benchmarks/workloads/absl/Dockerfile b/benchmarks/workloads/absl/Dockerfile new file mode 100644 index 000000000..e935c5ddc --- /dev/null +++ b/benchmarks/workloads/absl/Dockerfile @@ -0,0 +1,24 @@ +FROM ubuntu:18.04 + +RUN set -x \ + && apt-get update \ + && apt-get install -y \ + wget \ + git \ + pkg-config \ + zip \ + g++ \ + zlib1g-dev \ + unzip \ + python3 \ + && rm -rf /var/lib/apt/lists/* +RUN wget https://github.com/bazelbuild/bazel/releases/download/0.27.0/bazel-0.27.0-installer-linux-x86_64.sh +RUN chmod +x bazel-0.27.0-installer-linux-x86_64.sh +RUN ./bazel-0.27.0-installer-linux-x86_64.sh + +RUN git clone https://github.com/abseil/abseil-cpp.git +WORKDIR abseil-cpp +RUN git checkout 43ef2148c0936ebf7cb4be6b19927a9d9d145b8f +RUN bazel clean +ENV path "absl/base/..." +CMD bazel build ${path} 2>&1 diff --git a/benchmarks/workloads/absl/__init__.py b/benchmarks/workloads/absl/__init__.py new file mode 100644 index 000000000..b40e3f915 --- /dev/null +++ b/benchmarks/workloads/absl/__init__.py @@ -0,0 +1,63 @@ +# python3 +# Copyright 2019 The gVisor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ABSL build benchmark.""" + +import re + +SAMPLE_BAZEL_OUTPUT = """Extracting Bazel installation... +Starting local Bazel server and connecting to it... +Loading: +Loading: 0 packages loaded +Loading: 0 packages loaded + currently loading: absl/algorithm ... (11 packages) +Analyzing: 241 targets (16 packages loaded, 0 targets configured) +Analyzing: 241 targets (21 packages loaded, 617 targets configured) +Analyzing: 241 targets (27 packages loaded, 687 targets configured) +Analyzing: 241 targets (32 packages loaded, 1105 targets configured) +Analyzing: 241 targets (32 packages loaded, 1294 targets configured) +Analyzing: 241 targets (35 packages loaded, 1575 targets configured) +Analyzing: 241 targets (35 packages loaded, 1575 targets configured) +Analyzing: 241 targets (36 packages loaded, 1603 targets configured) +Analyzing: 241 targets (36 packages loaded, 1603 targets configured) +INFO: Analyzed 241 targets (37 packages loaded, 1864 targets configured). +INFO: Found 241 targets... +[0 / 5] [Prepa] BazelWorkspaceStatusAction stable-status.txt +[16 / 50] [Analy] Compiling absl/base/dynamic_annotations.cc ... (20 actions, 10 running) +[60 / 77] Compiling external/com_google_googletest/googletest/src/gtest.cc; 5s processwrapper-sandbox ... (12 actions, 11 running) +[158 / 174] Compiling absl/container/internal/raw_hash_set_test.cc; 2s processwrapper-sandbox ... (12 actions, 11 running) +[278 / 302] Compiling absl/container/internal/raw_hash_set_test.cc; 6s processwrapper-sandbox ... (12 actions, 11 running) +[384 / 406] Compiling absl/container/internal/raw_hash_set_test.cc; 10s processwrapper-sandbox ... (12 actions, 11 running) +[581 / 604] Compiling absl/container/flat_hash_set_test.cc; 11s processwrapper-sandbox ... (12 actions, 11 running) +[722 / 745] Compiling absl/container/node_hash_set_test.cc; 9s processwrapper-sandbox ... (12 actions, 11 running) +[846 / 867] Compiling absl/hash/hash_test.cc; 11s processwrapper-sandbox ... (12 actions, 11 running) +INFO: From Compiling absl/debugging/symbolize_test.cc: +/tmp/cclCVipU.s: Assembler messages: +/tmp/cclCVipU.s:1662: Warning: ignoring changed section attributes for .text +[999 / 1,022] Compiling absl/hash/hash_test.cc; 19s processwrapper-sandbox ... (12 actions, 11 running) +[1,082 / 1,084] Compiling absl/container/flat_hash_map_test.cc; 7s processwrapper-sandbox +INFO: Elapsed time: 81.861s, Critical Path: 23.81s +INFO: 515 processes: 515 processwrapper-sandbox. +INFO: Build completed successfully, 1084 total actions +INFO: Build completed successfully, 1084 total actions""" + + +def sample(): + return SAMPLE_BAZEL_OUTPUT + + +# pylint: disable=unused-argument +def elapsed_time(data: str, **kwargs) -> float: + """Returns the elapsed time for running an absl build.""" + return float(re.compile(r"Elapsed time: (\d*.?\d*)s").search(data).group(1)) diff --git a/benchmarks/workloads/absl/absl_test.py b/benchmarks/workloads/absl/absl_test.py new file mode 100644 index 000000000..41f216999 --- /dev/null +++ b/benchmarks/workloads/absl/absl_test.py @@ -0,0 +1,31 @@ +# python3 +# Copyright 2019 The gVisor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ABSL build test.""" + +import sys + +import pytest + +from benchmarks.workloads import absl + + +def test_elapsed_time(): + """Test elapsed_time.""" + res = absl.elapsed_time(absl.sample()) + assert res == 81.861 + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/benchmarks/workloads/curl/BUILD b/benchmarks/workloads/curl/BUILD new file mode 100644 index 000000000..83f3c71a0 --- /dev/null +++ b/benchmarks/workloads/curl/BUILD @@ -0,0 +1,11 @@ +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +filegroup( + name = "files", + srcs = [ + "Dockerfile", + ], +) diff --git a/benchmarks/workloads/curl/Dockerfile b/benchmarks/workloads/curl/Dockerfile new file mode 100644 index 000000000..336cb088a --- /dev/null +++ b/benchmarks/workloads/curl/Dockerfile @@ -0,0 +1,14 @@ +FROM ubuntu:18.04 + +RUN set -x \ + && apt-get update \ + && apt-get install -y \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Accept a host and port parameter. +ENV host localhost +ENV port 8080 + +# Spin until we make a successful request. +CMD ["sh", "-c", "while ! curl -v -i http://$host:$port; do true; done"] diff --git a/benchmarks/workloads/ffmpeg/BUILD b/benchmarks/workloads/ffmpeg/BUILD new file mode 100644 index 000000000..c1f2afc40 --- /dev/null +++ b/benchmarks/workloads/ffmpeg/BUILD @@ -0,0 +1,16 @@ +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +py_library( + name = "ffmpeg", + srcs = ["__init__.py"], +) + +filegroup( + name = "files", + srcs = [ + "Dockerfile", + ], +) diff --git a/benchmarks/workloads/ffmpeg/Dockerfile b/benchmarks/workloads/ffmpeg/Dockerfile new file mode 100644 index 000000000..f2f530d7c --- /dev/null +++ b/benchmarks/workloads/ffmpeg/Dockerfile @@ -0,0 +1,10 @@ +FROM ubuntu:18.04 + +RUN set -x \ + && apt-get update \ + && apt-get install -y \ + ffmpeg \ + && rm -rf /var/lib/apt/lists/* +WORKDIR /media +ADD https://samples.ffmpeg.org/MPEG-4/video.mp4 video.mp4 +CMD ["ffmpeg", "-i", "video.mp4", "-c:v", "libx264", "-preset", "veryslow", "output.mp4"] diff --git a/benchmarks/workloads/ffmpeg/__init__.py b/benchmarks/workloads/ffmpeg/__init__.py new file mode 100644 index 000000000..7578a443b --- /dev/null +++ b/benchmarks/workloads/ffmpeg/__init__.py @@ -0,0 +1,20 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Simple ffmpeg workload.""" + + +# pylint: disable=unused-argument +def run_time(value, **kwargs): + """Returns the startup and runtime of the ffmpeg workload in seconds.""" + return value diff --git a/benchmarks/workloads/fio/BUILD b/benchmarks/workloads/fio/BUILD new file mode 100644 index 000000000..7fc96cfa5 --- /dev/null +++ b/benchmarks/workloads/fio/BUILD @@ -0,0 +1,35 @@ +load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement") + +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +py_library( + name = "fio", + srcs = ["__init__.py"], +) + +py_test( + name = "fio_test", + srcs = ["fio_test.py"], + python_version = "PY3", + deps = [ + ":fio", + requirement("attrs", False), + requirement("atomicwrites", False), + requirement("more-itertools", False), + requirement("pathlib2", False), + requirement("pluggy", False), + requirement("py", False), + requirement("pytest", True), + requirement("six", False), + ], +) + +filegroup( + name = "files", + srcs = [ + "Dockerfile", + ], +) diff --git a/benchmarks/workloads/fio/Dockerfile b/benchmarks/workloads/fio/Dockerfile new file mode 100644 index 000000000..b3cf864eb --- /dev/null +++ b/benchmarks/workloads/fio/Dockerfile @@ -0,0 +1,23 @@ +FROM ubuntu:18.04 + +RUN set -x \ + && apt-get update \ + && apt-get install -y \ + fio \ + && rm -rf /var/lib/apt/lists/* + +# Parameterized test. +ENV test write +ENV ioengine sync +ENV size 5000000 +ENV iodepth 4 +ENV blocksize "1m" +ENV time "" +ENV path "/disk/file.dat" +ENV ramp_time 0 + +CMD ["sh", "-c", "fio --output-format=json --name=test --ramp_time=${ramp_time} --ioengine=${ioengine} --size=${size} \ +--filename=${path} --iodepth=${iodepth} --bs=${blocksize} --rw=${test} ${time}"] + + + diff --git a/benchmarks/workloads/fio/__init__.py b/benchmarks/workloads/fio/__init__.py new file mode 100644 index 000000000..52711e956 --- /dev/null +++ b/benchmarks/workloads/fio/__init__.py @@ -0,0 +1,369 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""FIO benchmark tool.""" + +import json + +SAMPLE_DATA = """ +{ + "fio version" : "fio-3.1", + "timestamp" : 1554837456, + "timestamp_ms" : 1554837456621, + "time" : "Tue Apr 9 19:17:36 2019", + "jobs" : [ + { + "jobname" : "test", + "groupid" : 0, + "error" : 0, + "eta" : 2147483647, + "elapsed" : 1, + "job options" : { + "name" : "test", + "ioengine" : "sync", + "size" : "1073741824", + "filename" : "/disk/file.dat", + "iodepth" : "4", + "bs" : "4096", + "rw" : "write" + }, + "read" : { + "io_bytes" : 0, + "io_kbytes" : 0, + "bw" : 0, + "iops" : 0.000000, + "runtime" : 0, + "total_ios" : 0, + "short_ios" : 0, + "drop_ios" : 0, + "slat_ns" : { + "min" : 0, + "max" : 0, + "mean" : 0.000000, + "stddev" : 0.000000 + }, + "clat_ns" : { + "min" : 0, + "max" : 0, + "mean" : 0.000000, + "stddev" : 0.000000, + "percentile" : { + "1.000000" : 0, + "5.000000" : 0, + "10.000000" : 0, + "20.000000" : 0, + "30.000000" : 0, + "40.000000" : 0, + "50.000000" : 0, + "60.000000" : 0, + "70.000000" : 0, + "80.000000" : 0, + "90.000000" : 0, + "95.000000" : 0, + "99.000000" : 0, + "99.500000" : 0, + "99.900000" : 0, + "99.950000" : 0, + "99.990000" : 0, + "0.00" : 0, + "0.00" : 0, + "0.00" : 0 + } + }, + "lat_ns" : { + "min" : 0, + "max" : 0, + "mean" : 0.000000, + "stddev" : 0.000000 + }, + "bw_min" : 0, + "bw_max" : 0, + "bw_agg" : 0.000000, + "bw_mean" : 0.000000, + "bw_dev" : 0.000000, + "bw_samples" : 0, + "iops_min" : 0, + "iops_max" : 0, + "iops_mean" : 0.000000, + "iops_stddev" : 0.000000, + "iops_samples" : 0 + }, + "write" : { + "io_bytes" : 1073741824, + "io_kbytes" : 1048576, + "bw" : 1753471, + "iops" : 438367.892977, + "runtime" : 598, + "total_ios" : 262144, + "short_ios" : 0, + "drop_ios" : 0, + "slat_ns" : { + "min" : 0, + "max" : 0, + "mean" : 0.000000, + "stddev" : 0.000000 + }, + "clat_ns" : { + "min" : 1693, + "max" : 754733, + "mean" : 2076.404373, + "stddev" : 1724.195529, + "percentile" : { + "1.000000" : 1736, + "5.000000" : 1752, + "10.000000" : 1768, + "20.000000" : 1784, + "30.000000" : 1800, + "40.000000" : 1800, + "50.000000" : 1816, + "60.000000" : 1816, + "70.000000" : 1848, + "80.000000" : 1928, + "90.000000" : 2512, + "95.000000" : 2992, + "99.000000" : 6176, + "99.500000" : 6304, + "99.900000" : 11328, + "99.950000" : 15168, + "99.990000" : 17792, + "0.00" : 0, + "0.00" : 0, + "0.00" : 0 + } + }, + "lat_ns" : { + "min" : 1731, + "max" : 754770, + "mean" : 2117.878979, + "stddev" : 1730.290512 + }, + "bw_min" : 1731120, + "bw_max" : 1731120, + "bw_agg" : 98.725328, + "bw_mean" : 1731120.000000, + "bw_dev" : 0.000000, + "bw_samples" : 1, + "iops_min" : 432780, + "iops_max" : 432780, + "iops_mean" : 432780.000000, + "iops_stddev" : 0.000000, + "iops_samples" : 1 + }, + "trim" : { + "io_bytes" : 0, + "io_kbytes" : 0, + "bw" : 0, + "iops" : 0.000000, + "runtime" : 0, + "total_ios" : 0, + "short_ios" : 0, + "drop_ios" : 0, + "slat_ns" : { + "min" : 0, + "max" : 0, + "mean" : 0.000000, + "stddev" : 0.000000 + }, + "clat_ns" : { + "min" : 0, + "max" : 0, + "mean" : 0.000000, + "stddev" : 0.000000, + "percentile" : { + "1.000000" : 0, + "5.000000" : 0, + "10.000000" : 0, + "20.000000" : 0, + "30.000000" : 0, + "40.000000" : 0, + "50.000000" : 0, + "60.000000" : 0, + "70.000000" : 0, + "80.000000" : 0, + "90.000000" : 0, + "95.000000" : 0, + "99.000000" : 0, + "99.500000" : 0, + "99.900000" : 0, + "99.950000" : 0, + "99.990000" : 0, + "0.00" : 0, + "0.00" : 0, + "0.00" : 0 + } + }, + "lat_ns" : { + "min" : 0, + "max" : 0, + "mean" : 0.000000, + "stddev" : 0.000000 + }, + "bw_min" : 0, + "bw_max" : 0, + "bw_agg" : 0.000000, + "bw_mean" : 0.000000, + "bw_dev" : 0.000000, + "bw_samples" : 0, + "iops_min" : 0, + "iops_max" : 0, + "iops_mean" : 0.000000, + "iops_stddev" : 0.000000, + "iops_samples" : 0 + }, + "usr_cpu" : 17.922948, + "sys_cpu" : 81.574539, + "ctx" : 3, + "majf" : 0, + "minf" : 10, + "iodepth_level" : { + "1" : 100.000000, + "2" : 0.000000, + "4" : 0.000000, + "8" : 0.000000, + "16" : 0.000000, + "32" : 0.000000, + ">=64" : 0.000000 + }, + "latency_ns" : { + "2" : 0.000000, + "4" : 0.000000, + "10" : 0.000000, + "20" : 0.000000, + "50" : 0.000000, + "100" : 0.000000, + "250" : 0.000000, + "500" : 0.000000, + "750" : 0.000000, + "1000" : 0.000000 + }, + "latency_us" : { + "2" : 82.737350, + "4" : 12.605286, + "10" : 4.543686, + "20" : 0.107956, + "50" : 0.010000, + "100" : 0.000000, + "250" : 0.000000, + "500" : 0.000000, + "750" : 0.000000, + "1000" : 0.010000 + }, + "latency_ms" : { + "2" : 0.000000, + "4" : 0.000000, + "10" : 0.000000, + "20" : 0.000000, + "50" : 0.000000, + "100" : 0.000000, + "250" : 0.000000, + "500" : 0.000000, + "750" : 0.000000, + "1000" : 0.000000, + "2000" : 0.000000, + ">=2000" : 0.000000 + }, + "latency_depth" : 4, + "latency_target" : 0, + "latency_percentile" : 100.000000, + "latency_window" : 0 + } + ], + "disk_util" : [ + { + "name" : "dm-1", + "read_ios" : 0, + "write_ios" : 3, + "read_merges" : 0, + "write_merges" : 0, + "read_ticks" : 0, + "write_ticks" : 0, + "in_queue" : 0, + "util" : 0.000000, + "aggr_read_ios" : 0, + "aggr_write_ios" : 3, + "aggr_read_merges" : 0, + "aggr_write_merge" : 0, + "aggr_read_ticks" : 0, + "aggr_write_ticks" : 0, + "aggr_in_queue" : 0, + "aggr_util" : 0.000000 + }, + { + "name" : "dm-0", + "read_ios" : 0, + "write_ios" : 3, + "read_merges" : 0, + "write_merges" : 0, + "read_ticks" : 0, + "write_ticks" : 0, + "in_queue" : 0, + "util" : 0.000000, + "aggr_read_ios" : 0, + "aggr_write_ios" : 3, + "aggr_read_merges" : 0, + "aggr_write_merge" : 0, + "aggr_read_ticks" : 0, + "aggr_write_ticks" : 2, + "aggr_in_queue" : 0, + "aggr_util" : 0.000000 + }, + { + "name" : "nvme0n1", + "read_ios" : 0, + "write_ios" : 3, + "read_merges" : 0, + "write_merges" : 0, + "read_ticks" : 0, + "write_ticks" : 2, + "in_queue" : 0, + "util" : 0.000000 + } + ] +} +""" + + +# pylint: disable=unused-argument +def sample(**kwargs) -> str: + return SAMPLE_DATA + + +# pylint: disable=unused-argument +def read_bandwidth(data: str, **kwargs) -> int: + """File I/O bandwidth.""" + return json.loads(data)["jobs"][0]["read"]["bw"] * 1024 + + +# pylint: disable=unused-argument +def write_bandwidth(data: str, **kwargs) -> int: + """File I/O bandwidth.""" + return json.loads(data)["jobs"][0]["write"]["bw"] * 1024 + + +# pylint: disable=unused-argument +def read_io_ops(data: str, **kwargs) -> float: + """File I/O operations per second.""" + return float(json.loads(data)["jobs"][0]["read"]["iops"]) + + +# pylint: disable=unused-argument +def write_io_ops(data: str, **kwargs) -> float: + """File I/O operations per second.""" + return float(json.loads(data)["jobs"][0]["write"]["iops"]) + + +# Change function names so we just print "bandwidth" and "io_ops". +read_bandwidth.__name__ = "bandwidth" +write_bandwidth.__name__ = "bandwidth" +read_io_ops.__name__ = "io_ops" +write_io_ops.__name__ = "io_ops" diff --git a/benchmarks/workloads/fio/fio_test.py b/benchmarks/workloads/fio/fio_test.py new file mode 100644 index 000000000..04a6eeb7e --- /dev/null +++ b/benchmarks/workloads/fio/fio_test.py @@ -0,0 +1,44 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Parser tests.""" + +import sys + +import pytest + +from benchmarks.workloads import fio + + +def test_read_io_ops(): + """Test read ops parser.""" + assert fio.read_io_ops(fio.sample()) == 0.0 + + +def test_write_io_ops(): + """Test write ops parser.""" + assert fio.write_io_ops(fio.sample()) == 438367.892977 + + +def test_read_bandwidth(): + """Test read bandwidth parser.""" + assert fio.read_bandwidth(fio.sample()) == 0.0 + + +def test_write_bandwith(): + """Test write bandwidth parser.""" + assert fio.write_bandwidth(fio.sample()) == 1753471 * 1024 + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/benchmarks/workloads/httpd/BUILD b/benchmarks/workloads/httpd/BUILD new file mode 100644 index 000000000..83f3c71a0 --- /dev/null +++ b/benchmarks/workloads/httpd/BUILD @@ -0,0 +1,11 @@ +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +filegroup( + name = "files", + srcs = [ + "Dockerfile", + ], +) diff --git a/benchmarks/workloads/httpd/Dockerfile b/benchmarks/workloads/httpd/Dockerfile new file mode 100644 index 000000000..5259c8f4f --- /dev/null +++ b/benchmarks/workloads/httpd/Dockerfile @@ -0,0 +1,27 @@ +FROM ubuntu:18.04 + +RUN set -x \ + && apt-get update \ + && apt-get install -y \ + apache2 \ + && rm -rf /var/lib/apt/lists/* + +# Link the htdoc directory to tmp. +RUN mkdir -p /usr/local/apache2/htdocs && \ + cd /usr/local/apache2 && ln -s /tmp htdocs + +# Generate a bunch of relevant files. +RUN mkdir -p /local && \ + for size in 1 10 100 1000 1024 10240; do \ + dd if=/dev/zero of=/local/latin${size}k.txt count=${size} bs=1024; \ + done + +# Standard settings. +ENV APACHE_RUN_DIR /tmp +ENV APACHE_RUN_USER nobody +ENV APACHE_RUN_GROUP nogroup +ENV APACHE_LOG_DIR /tmp +ENV APACHE_PID_FILE /tmp/apache.pid + +# Copy on start-up; serve everything from /tmp (including the configuration). +CMD ["sh", "-c", "cp -a /local/* /tmp && apache2 -c \"ServerName localhost\" -c \"DocumentRoot /tmp\" -X"] diff --git a/benchmarks/workloads/iperf/BUILD b/benchmarks/workloads/iperf/BUILD new file mode 100644 index 000000000..fe0acbfce --- /dev/null +++ b/benchmarks/workloads/iperf/BUILD @@ -0,0 +1,35 @@ +load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement") + +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +py_library( + name = "iperf", + srcs = ["__init__.py"], +) + +py_test( + name = "iperf_test", + srcs = ["iperf_test.py"], + python_version = "PY3", + deps = [ + ":iperf", + requirement("attrs", False), + requirement("atomicwrites", False), + requirement("more-itertools", False), + requirement("pathlib2", False), + requirement("pluggy", False), + requirement("py", False), + requirement("pytest", True), + requirement("six", False), + ], +) + +filegroup( + name = "files", + srcs = [ + "Dockerfile", + ], +) diff --git a/benchmarks/workloads/iperf/Dockerfile b/benchmarks/workloads/iperf/Dockerfile new file mode 100644 index 000000000..9704c506c --- /dev/null +++ b/benchmarks/workloads/iperf/Dockerfile @@ -0,0 +1,14 @@ +FROM ubuntu:18.04 + +RUN set -x \ + && apt-get update \ + && apt-get install -y \ + iperf \ + && rm -rf /var/lib/apt/lists/* + +# Accept a host parameter. +ENV host "" +ENV port 5001 + +# Start as client if the host is provided. +CMD ["sh", "-c", "test -z \"${host}\" && iperf -s || iperf -f K --realtime -c ${host} -p ${port}"] diff --git a/benchmarks/workloads/iperf/__init__.py b/benchmarks/workloads/iperf/__init__.py new file mode 100644 index 000000000..3817a7ade --- /dev/null +++ b/benchmarks/workloads/iperf/__init__.py @@ -0,0 +1,40 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""iperf.""" + +import re + +SAMPLE_DATA = """ +------------------------------------------------------------ +Client connecting to 10.138.15.215, TCP port 32779 +TCP window size: 45.0 KByte (default) +------------------------------------------------------------ +[ 3] local 10.138.15.216 port 32866 connected with 10.138.15.215 port 32779 +[ ID] Interval Transfer Bandwidth +[ 3] 0.0-10.0 sec 459520 KBytes 45900 KBytes/sec + +""" + + +# pylint: disable=unused-argument +def sample(**kwargs) -> str: + return SAMPLE_DATA + + +# pylint: disable=unused-argument +def bandwidth(data: str, **kwargs) -> float: + """Calculate the bandwidth.""" + regex = r"\[\s*\d+\][^\n]+\s+(\d+\.?\d*)\s+KBytes/sec" + res = re.compile(regex).search(data) + return float(res.group(1)) * 1000 diff --git a/benchmarks/workloads/iperf/iperf_test.py b/benchmarks/workloads/iperf/iperf_test.py new file mode 100644 index 000000000..6959b7e8a --- /dev/null +++ b/benchmarks/workloads/iperf/iperf_test.py @@ -0,0 +1,28 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for iperf.""" + +import sys + +import pytest + +from benchmarks.workloads import iperf + + +def test_bandwidth(): + assert iperf.bandwidth(iperf.sample()) == 45900 * 1000 + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/benchmarks/workloads/netcat/BUILD b/benchmarks/workloads/netcat/BUILD new file mode 100644 index 000000000..83f3c71a0 --- /dev/null +++ b/benchmarks/workloads/netcat/BUILD @@ -0,0 +1,11 @@ +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +filegroup( + name = "files", + srcs = [ + "Dockerfile", + ], +) diff --git a/benchmarks/workloads/netcat/Dockerfile b/benchmarks/workloads/netcat/Dockerfile new file mode 100644 index 000000000..d8548d89a --- /dev/null +++ b/benchmarks/workloads/netcat/Dockerfile @@ -0,0 +1,14 @@ +FROM ubuntu:18.04 + +RUN set -x \ + && apt-get update \ + && apt-get install -y \ + netcat \ + && rm -rf /var/lib/apt/lists/* + +# Accept a host and port parameter. +ENV host localhost +ENV port 8080 + +# Spin until we make a successful request. +CMD ["sh", "-c", "while ! nc -zv $host $port; do true; done"] diff --git a/benchmarks/workloads/nginx/BUILD b/benchmarks/workloads/nginx/BUILD new file mode 100644 index 000000000..83f3c71a0 --- /dev/null +++ b/benchmarks/workloads/nginx/BUILD @@ -0,0 +1,11 @@ +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +filegroup( + name = "files", + srcs = [ + "Dockerfile", + ], +) diff --git a/benchmarks/workloads/nginx/Dockerfile b/benchmarks/workloads/nginx/Dockerfile new file mode 100644 index 000000000..b64eb52ae --- /dev/null +++ b/benchmarks/workloads/nginx/Dockerfile @@ -0,0 +1 @@ +FROM nginx:1.15.10 diff --git a/benchmarks/workloads/node/BUILD b/benchmarks/workloads/node/BUILD new file mode 100644 index 000000000..59460d02f --- /dev/null +++ b/benchmarks/workloads/node/BUILD @@ -0,0 +1,13 @@ +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +filegroup( + name = "files", + srcs = [ + "Dockerfile", + "index.js", + "package.json", + ], +) diff --git a/benchmarks/workloads/node/Dockerfile b/benchmarks/workloads/node/Dockerfile new file mode 100644 index 000000000..139a38bf5 --- /dev/null +++ b/benchmarks/workloads/node/Dockerfile @@ -0,0 +1,2 @@ +FROM node:onbuild +CMD ["node", "index.js"] diff --git a/benchmarks/workloads/node/index.js b/benchmarks/workloads/node/index.js new file mode 100644 index 000000000..584158462 --- /dev/null +++ b/benchmarks/workloads/node/index.js @@ -0,0 +1,28 @@ +'use strict'; + +var start = new Date().getTime(); + +// Load dependencies to simulate an average nodejs app. +var req_0 = require('async'); +var req_1 = require('bluebird'); +var req_2 = require('firebase'); +var req_3 = require('firebase-admin'); +var req_4 = require('@google-cloud/container'); +var req_5 = require('@google-cloud/logging'); +var req_6 = require('@google-cloud/monitoring'); +var req_7 = require('@google-cloud/spanner'); +var req_8 = require('lodash'); +var req_9 = require('mailgun-js'); +var req_10 = require('request'); +var express = require('express'); +var app = express(); + +var loaded = new Date().getTime() - start; +app.get('/', function(req, res) { + res.send('Hello World!<br>Loaded in ' + loaded + 'ms'); +}); + +console.log('Loaded in ' + loaded + ' ms'); +app.listen(8080, function() { + console.log('Listening on port 8080...'); +}); diff --git a/benchmarks/workloads/node/package.json b/benchmarks/workloads/node/package.json new file mode 100644 index 000000000..c00b9b3cb --- /dev/null +++ b/benchmarks/workloads/node/package.json @@ -0,0 +1,19 @@ +{ + "name": "node", + "version": "1.0.0", + "main": "index.js", + "dependencies": { + "@google-cloud/container": "^0.3.0", + "@google-cloud/logging": "^4.2.0", + "@google-cloud/monitoring": "^0.6.0", + "@google-cloud/spanner": "^2.2.1", + "async": "^2.6.1", + "bluebird": "^3.5.3", + "express": "^4.16.4", + "firebase": "^5.7.2", + "firebase-admin": "^6.4.0", + "lodash": "^4.17.11", + "mailgun-js": "^0.22.0", + "request": "^2.88.0" + } +} diff --git a/benchmarks/workloads/node_template/BUILD b/benchmarks/workloads/node_template/BUILD new file mode 100644 index 000000000..ae7f121d3 --- /dev/null +++ b/benchmarks/workloads/node_template/BUILD @@ -0,0 +1,15 @@ +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +filegroup( + name = "files", + srcs = [ + "Dockerfile", + "index.hbs", + "index.js", + "package.json", + "package-lock.json", + ], +) diff --git a/benchmarks/workloads/node_template/Dockerfile b/benchmarks/workloads/node_template/Dockerfile new file mode 100644 index 000000000..7eb065d54 --- /dev/null +++ b/benchmarks/workloads/node_template/Dockerfile @@ -0,0 +1,5 @@ +FROM node:onbuild + +ENV host "127.0.0.1" + +CMD ["sh", "-c", "node index.js ${host}"] diff --git a/benchmarks/workloads/node_template/index.hbs b/benchmarks/workloads/node_template/index.hbs new file mode 100644 index 000000000..03feceb75 --- /dev/null +++ b/benchmarks/workloads/node_template/index.hbs @@ -0,0 +1,8 @@ +<!DOCTYPE html> +<html> +<body> + {{#each text}} + <p>{{this}}</p> + {{/each}} +</body> +</html> diff --git a/benchmarks/workloads/node_template/index.js b/benchmarks/workloads/node_template/index.js new file mode 100644 index 000000000..04a27f356 --- /dev/null +++ b/benchmarks/workloads/node_template/index.js @@ -0,0 +1,43 @@ +const app = require('express')(); +const path = require('path'); +const redis = require('redis'); +const srs = require('secure-random-string'); + +// The hostname is the first argument. +const host_name = process.argv[2]; + +var client = redis.createClient({host: host_name, detect_buffers: true}); + +app.set('views', __dirname); +app.set('view engine', 'hbs'); + +app.get('/', (req, res) => { + var tmp = []; + /* Pull four random keys from the redis server. */ + for (i = 0; i < 4; i++) { + client.get(Math.floor(Math.random() * (100)), function(err, reply) { + tmp.push(reply.toString()); + }); + } + + res.render('index', {text: tmp}); +}); + +/** + * Securely generate a random string. + * @param {number} len + * @return {string} + */ +function randomBody(len) { + return srs({alphanumeric: true, length: len}); +} + +/** Mutates one hundred keys randomly. */ +function generateText() { + for (i = 0; i < 100; i++) { + client.set(i, randomBody(1024)); + } +} + +generateText(); +app.listen(8080); diff --git a/benchmarks/workloads/node_template/package-lock.json b/benchmarks/workloads/node_template/package-lock.json new file mode 100644 index 000000000..580e68aa5 --- /dev/null +++ b/benchmarks/workloads/node_template/package-lock.json @@ -0,0 +1,486 @@ +{ + "name": "nodedum", + "version": "1.0.0", + "lockfileVersion": 1, + "requires": true, + "dependencies": { + "accepts": { + "version": "1.3.5", + "resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.5.tgz", + "integrity": "sha1-63d99gEXI6OxTopywIBcjoZ0a9I=", + "requires": { + "mime-types": "~2.1.18", + "negotiator": "0.6.1" + } + }, + "array-flatten": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz", + "integrity": "sha1-ml9pkFGx5wczKPKgCJaLZOopVdI=" + }, + "async": { + "version": "2.6.2", + "resolved": "https://registry.npmjs.org/async/-/async-2.6.2.tgz", + "integrity": "sha512-H1qVYh1MYhEEFLsP97cVKqCGo7KfCyTt6uEWqsTBr9SO84oK9Uwbyd/yCW+6rKJLHksBNUVWZDAjfS+Ccx0Bbg==", + "requires": { + "lodash": "^4.17.11" + } + }, + "body-parser": { + "version": "1.18.3", + "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.18.3.tgz", + "integrity": "sha1-WykhmP/dVTs6DyDe0FkrlWlVyLQ=", + "requires": { + "bytes": "3.0.0", + "content-type": "~1.0.4", + "debug": "2.6.9", + "depd": "~1.1.2", + "http-errors": "~1.6.3", + "iconv-lite": "0.4.23", + "on-finished": "~2.3.0", + "qs": "6.5.2", + "raw-body": "2.3.3", + "type-is": "~1.6.16" + } + }, + "bytes": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.0.0.tgz", + "integrity": "sha1-0ygVQE1olpn4Wk6k+odV3ROpYEg=" + }, + "commander": { + "version": "2.20.0", + "resolved": "https://registry.npmjs.org/commander/-/commander-2.20.0.tgz", + "integrity": "sha512-7j2y+40w61zy6YC2iRNpUe/NwhNyoXrYpHMrSunaMG64nRnaf96zO/KMQR4OyN/UnE5KLyEBnKHd4aG3rskjpQ==", + "optional": true + }, + "content-disposition": { + "version": "0.5.2", + "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.2.tgz", + "integrity": "sha1-DPaLud318r55YcOoUXjLhdunjLQ=" + }, + "content-type": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.4.tgz", + "integrity": "sha512-hIP3EEPs8tB9AT1L+NUqtwOAps4mk2Zob89MWXMHjHWg9milF/j4osnnQLXBCBFBk/tvIG/tUc9mOUJiPBhPXA==" + }, + "cookie": { + "version": "0.3.1", + "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.3.1.tgz", + "integrity": "sha1-5+Ch+e9DtMi6klxcWpboBtFoc7s=" + }, + "cookie-signature": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.6.tgz", + "integrity": "sha1-4wOogrNCzD7oylE6eZmXNNqzriw=" + }, + "debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "requires": { + "ms": "2.0.0" + } + }, + "depd": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz", + "integrity": "sha1-m81S4UwJd2PnSbJ0xDRu0uVgtak=" + }, + "destroy": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/destroy/-/destroy-1.0.4.tgz", + "integrity": "sha1-l4hXRCxEdJ5CBmE+N5RiBYJqvYA=" + }, + "double-ended-queue": { + "version": "2.1.0-0", + "resolved": "https://registry.npmjs.org/double-ended-queue/-/double-ended-queue-2.1.0-0.tgz", + "integrity": "sha1-ED01J/0xUo9AGIEwyEHv3XgmTlw=" + }, + "ee-first": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz", + "integrity": "sha1-WQxhFWsK4vTwJVcyoViyZrxWsh0=" + }, + "encodeurl": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz", + "integrity": "sha1-rT/0yG7C0CkyL1oCw6mmBslbP1k=" + }, + "escape-html": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/escape-html/-/escape-html-1.0.3.tgz", + "integrity": "sha1-Aljq5NPQwJdN4cFpGI7wBR0dGYg=" + }, + "etag": { + "version": "1.8.1", + "resolved": "https://registry.npmjs.org/etag/-/etag-1.8.1.tgz", + "integrity": "sha1-Qa4u62XvpiJorr/qg6x9eSmbCIc=" + }, + "express": { + "version": "4.16.4", + "resolved": "https://registry.npmjs.org/express/-/express-4.16.4.tgz", + "integrity": "sha512-j12Uuyb4FMrd/qQAm6uCHAkPtO8FDTRJZBDd5D2KOL2eLaz1yUNdUB/NOIyq0iU4q4cFarsUCrnFDPBcnksuOg==", + "requires": { + "accepts": "~1.3.5", + "array-flatten": "1.1.1", + "body-parser": "1.18.3", + "content-disposition": "0.5.2", + "content-type": "~1.0.4", + "cookie": "0.3.1", + "cookie-signature": "1.0.6", + "debug": "2.6.9", + "depd": "~1.1.2", + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "etag": "~1.8.1", + "finalhandler": "1.1.1", + "fresh": "0.5.2", + "merge-descriptors": "1.0.1", + "methods": "~1.1.2", + "on-finished": "~2.3.0", + "parseurl": "~1.3.2", + "path-to-regexp": "0.1.7", + "proxy-addr": "~2.0.4", + "qs": "6.5.2", + "range-parser": "~1.2.0", + "safe-buffer": "5.1.2", + "send": "0.16.2", + "serve-static": "1.13.2", + "setprototypeof": "1.1.0", + "statuses": "~1.4.0", + "type-is": "~1.6.16", + "utils-merge": "1.0.1", + "vary": "~1.1.2" + } + }, + "finalhandler": { + "version": "1.1.1", + "resolved": "http://registry.npmjs.org/finalhandler/-/finalhandler-1.1.1.tgz", + "integrity": "sha512-Y1GUDo39ez4aHAw7MysnUD5JzYX+WaIj8I57kO3aEPT1fFRL4sr7mjei97FgnwhAyyzRYmQZaTHb2+9uZ1dPtg==", + "requires": { + "debug": "2.6.9", + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "on-finished": "~2.3.0", + "parseurl": "~1.3.2", + "statuses": "~1.4.0", + "unpipe": "~1.0.0" + } + }, + "foreachasync": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/foreachasync/-/foreachasync-3.0.0.tgz", + "integrity": "sha1-VQKYfchxS+M5IJfzLgBxyd7gfPY=" + }, + "forwarded": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.1.2.tgz", + "integrity": "sha1-mMI9qxF1ZXuMBXPozszZGw/xjIQ=" + }, + "fresh": { + "version": "0.5.2", + "resolved": "https://registry.npmjs.org/fresh/-/fresh-0.5.2.tgz", + "integrity": "sha1-PYyt2Q2XZWn6g1qx+OSyOhBWBac=" + }, + "handlebars": { + "version": "4.0.14", + "resolved": "https://registry.npmjs.org/handlebars/-/handlebars-4.0.14.tgz", + "integrity": "sha512-E7tDoyAA8ilZIV3xDJgl18sX3M8xB9/fMw8+mfW4msLW8jlX97bAnWgT3pmaNXuvzIEgSBMnAHfuXsB2hdzfow==", + "requires": { + "async": "^2.5.0", + "optimist": "^0.6.1", + "source-map": "^0.6.1", + "uglify-js": "^3.1.4" + } + }, + "hbs": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/hbs/-/hbs-4.0.4.tgz", + "integrity": "sha512-esVlyV/V59mKkwFai5YmPRSNIWZzhqL5YMN0++ueMxyK1cCfPa5f6JiHtapPKAIVAhQR6rpGxow0troav9WMEg==", + "requires": { + "handlebars": "4.0.14", + "walk": "2.3.9" + } + }, + "http-errors": { + "version": "1.6.3", + "resolved": "http://registry.npmjs.org/http-errors/-/http-errors-1.6.3.tgz", + "integrity": "sha1-i1VoC7S+KDoLW/TqLjhYC+HZMg0=", + "requires": { + "depd": "~1.1.2", + "inherits": "2.0.3", + "setprototypeof": "1.1.0", + "statuses": ">= 1.4.0 < 2" + } + }, + "iconv-lite": { + "version": "0.4.23", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.23.tgz", + "integrity": "sha512-neyTUVFtahjf0mB3dZT77u+8O0QB89jFdnBkd5P1JgYPbPaia3gXXOVL2fq8VyU2gMMD7SaN7QukTB/pmXYvDA==", + "requires": { + "safer-buffer": ">= 2.1.2 < 3" + } + }, + "inherits": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz", + "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4=" + }, + "ipaddr.js": { + "version": "1.8.0", + "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.8.0.tgz", + "integrity": "sha1-6qM9bd16zo9/b+DJygRA5wZzix4=" + }, + "lodash": { + "version": "4.17.15", + "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.15.tgz", + "integrity": "sha512-8xOcRHvCjnocdS5cpwXQXVzmmh5e5+saE2QGoeQmbKmRS6J3VQppPOIt0MnmE+4xlZoumy0GPG0D0MVIQbNA1A==" + }, + "media-typer": { + "version": "0.3.0", + "resolved": "http://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz", + "integrity": "sha1-hxDXrwqmJvj/+hzgAWhUUmMlV0g=" + }, + "merge-descriptors": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.1.tgz", + "integrity": "sha1-sAqqVW3YtEVoFQ7J0blT8/kMu2E=" + }, + "methods": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/methods/-/methods-1.1.2.tgz", + "integrity": "sha1-VSmk1nZUE07cxSZmVoNbD4Ua/O4=" + }, + "mime": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/mime/-/mime-1.4.1.tgz", + "integrity": "sha512-KI1+qOZu5DcW6wayYHSzR/tXKCDC5Om4s1z2QJjDULzLcmf3DvzS7oluY4HCTrc+9FiKmWUgeNLg7W3uIQvxtQ==" + }, + "mime-db": { + "version": "1.37.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.37.0.tgz", + "integrity": "sha512-R3C4db6bgQhlIhPU48fUtdVmKnflq+hRdad7IyKhtFj06VPNVdk2RhiYL3UjQIlso8L+YxAtFkobT0VK+S/ybg==" + }, + "mime-types": { + "version": "2.1.21", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.21.tgz", + "integrity": "sha512-3iL6DbwpyLzjR3xHSFNFeb9Nz/M8WDkX33t1GFQnFOllWk8pOrh/LSrB5OXlnlW5P9LH73X6loW/eogc+F5lJg==", + "requires": { + "mime-db": "~1.37.0" + } + }, + "minimist": { + "version": "0.0.10", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-0.0.10.tgz", + "integrity": "sha1-3j+YVD2/lggr5IrRoMfNqDYwHc8=" + }, + "ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=" + }, + "negotiator": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.1.tgz", + "integrity": "sha1-KzJxhOiZIQEXeyhWP7XnECrNDKk=" + }, + "on-finished": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.3.0.tgz", + "integrity": "sha1-IPEzZIGwg811M3mSoWlxqi2QaUc=", + "requires": { + "ee-first": "1.1.1" + } + }, + "optimist": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/optimist/-/optimist-0.6.1.tgz", + "integrity": "sha1-2j6nRob6IaGaERwybpDrFaAZZoY=", + "requires": { + "minimist": "~0.0.1", + "wordwrap": "~0.0.2" + } + }, + "parseurl": { + "version": "1.3.2", + "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.2.tgz", + "integrity": "sha1-/CidTtiZMRlGDBViUyYs3I3mW/M=" + }, + "path-to-regexp": { + "version": "0.1.7", + "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.7.tgz", + "integrity": "sha1-32BBeABfUi8V60SQ5yR6G/qmf4w=" + }, + "proxy-addr": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.4.tgz", + "integrity": "sha512-5erio2h9jp5CHGwcybmxmVqHmnCBZeewlfJ0pex+UW7Qny7OOZXTtH56TGNyBizkgiOwhJtMKrVzDTeKcySZwA==", + "requires": { + "forwarded": "~0.1.2", + "ipaddr.js": "1.8.0" + } + }, + "qs": { + "version": "6.5.2", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.5.2.tgz", + "integrity": "sha512-N5ZAX4/LxJmF+7wN74pUD6qAh9/wnvdQcjq9TZjevvXzSUo7bfmw91saqMjzGS2xq91/odN2dW/WOl7qQHNDGA==" + }, + "range-parser": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.0.tgz", + "integrity": "sha1-9JvmtIeJTdxA3MlKMi9hEJLgDV4=" + }, + "raw-body": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.3.3.tgz", + "integrity": "sha512-9esiElv1BrZoI3rCDuOuKCBRbuApGGaDPQfjSflGxdy4oyzqghxu6klEkkVIvBje+FF0BX9coEv8KqW6X/7njw==", + "requires": { + "bytes": "3.0.0", + "http-errors": "1.6.3", + "iconv-lite": "0.4.23", + "unpipe": "1.0.0" + } + }, + "redis": { + "version": "2.8.0", + "resolved": "https://registry.npmjs.org/redis/-/redis-2.8.0.tgz", + "integrity": "sha512-M1OkonEQwtRmZv4tEWF2VgpG0JWJ8Fv1PhlgT5+B+uNq2cA3Rt1Yt/ryoR+vQNOQcIEgdCdfH0jr3bDpihAw1A==", + "requires": { + "double-ended-queue": "^2.1.0-0", + "redis-commands": "^1.2.0", + "redis-parser": "^2.6.0" + }, + "dependencies": { + "redis-commands": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/redis-commands/-/redis-commands-1.4.0.tgz", + "integrity": "sha512-cu8EF+MtkwI4DLIT0x9P8qNTLFhQD4jLfxLR0cCNkeGzs87FN6879JOJwNQR/1zD7aSYNbU0hgsV9zGY71Itvw==" + }, + "redis-parser": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/redis-parser/-/redis-parser-2.6.0.tgz", + "integrity": "sha1-Uu0J2srBCPGmMcB+m2mUHnoZUEs=" + } + } + }, + "redis-commands": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/redis-commands/-/redis-commands-1.5.0.tgz", + "integrity": "sha512-6KxamqpZ468MeQC3bkWmCB1fp56XL64D4Kf0zJSwDZbVLLm7KFkoIcHrgRvQ+sk8dnhySs7+yBg94yIkAK7aJg==" + }, + "redis-parser": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/redis-parser/-/redis-parser-2.6.0.tgz", + "integrity": "sha1-Uu0J2srBCPGmMcB+m2mUHnoZUEs=" + }, + "safe-buffer": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", + "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==" + }, + "safer-buffer": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", + "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==" + }, + "secure-random-string": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/secure-random-string/-/secure-random-string-1.1.0.tgz", + "integrity": "sha512-V/h8jqoz58zklNGybVhP++cWrxEPXlLM/6BeJ4e0a8zlb4BsbYRzFs16snrxByPa5LUxCVTD3M6EYIVIHR1fAg==" + }, + "send": { + "version": "0.16.2", + "resolved": "https://registry.npmjs.org/send/-/send-0.16.2.tgz", + "integrity": "sha512-E64YFPUssFHEFBvpbbjr44NCLtI1AohxQ8ZSiJjQLskAdKuriYEP6VyGEsRDH8ScozGpkaX1BGvhanqCwkcEZw==", + "requires": { + "debug": "2.6.9", + "depd": "~1.1.2", + "destroy": "~1.0.4", + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "etag": "~1.8.1", + "fresh": "0.5.2", + "http-errors": "~1.6.2", + "mime": "1.4.1", + "ms": "2.0.0", + "on-finished": "~2.3.0", + "range-parser": "~1.2.0", + "statuses": "~1.4.0" + } + }, + "serve-static": { + "version": "1.13.2", + "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.13.2.tgz", + "integrity": "sha512-p/tdJrO4U387R9oMjb1oj7qSMaMfmOyd4j9hOFoxZe2baQszgHcSWjuya/CiT5kgZZKRudHNOA0pYXOl8rQ5nw==", + "requires": { + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "parseurl": "~1.3.2", + "send": "0.16.2" + } + }, + "setprototypeof": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.1.0.tgz", + "integrity": "sha512-BvE/TwpZX4FXExxOxZyRGQQv651MSwmWKZGqvmPcRIjDqWub67kTKuIMx43cZZrS/cBBzwBcNDWoFxt2XEFIpQ==" + }, + "source-map": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==" + }, + "statuses": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/statuses/-/statuses-1.4.0.tgz", + "integrity": "sha512-zhSCtt8v2NDrRlPQpCNtw/heZLtfUDqxBM1udqikb/Hbk52LK4nQSwr10u77iopCW5LsyHpuXS0GnEc48mLeew==" + }, + "type-is": { + "version": "1.6.16", + "resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.16.tgz", + "integrity": "sha512-HRkVv/5qY2G6I8iab9cI7v1bOIdhm94dVjQCPFElW9W+3GeDOSHmy2EBYe4VTApuzolPcmgFTN3ftVJRKR2J9Q==", + "requires": { + "media-typer": "0.3.0", + "mime-types": "~2.1.18" + } + }, + "uglify-js": { + "version": "3.5.9", + "resolved": "https://registry.npmjs.org/uglify-js/-/uglify-js-3.5.9.tgz", + "integrity": "sha512-WpT0RqsDtAWPNJK955DEnb6xjymR8Fn0OlK4TT4pS0ASYsVPqr5ELhgwOwLCP5J5vHeJ4xmMmz3DEgdqC10JeQ==", + "optional": true, + "requires": { + "commander": "~2.20.0", + "source-map": "~0.6.1" + } + }, + "unpipe": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz", + "integrity": "sha1-sr9O6FFKrmFltIF4KdIbLvSZBOw=" + }, + "utils-merge": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz", + "integrity": "sha1-n5VxD1CiZ5R7LMwSR0HBAoQn5xM=" + }, + "vary": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz", + "integrity": "sha1-IpnwLG3tMNSllhsLn3RSShj2NPw=" + }, + "walk": { + "version": "2.3.9", + "resolved": "https://registry.npmjs.org/walk/-/walk-2.3.9.tgz", + "integrity": "sha1-MbTbZnjyrgHDnqn7hyWpAx5Vins=", + "requires": { + "foreachasync": "^3.0.0" + } + }, + "wordwrap": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-0.0.3.tgz", + "integrity": "sha1-o9XabNXAvAAI03I0u68b7WMFkQc=" + } + } +} diff --git a/benchmarks/workloads/node_template/package.json b/benchmarks/workloads/node_template/package.json new file mode 100644 index 000000000..7dcadd523 --- /dev/null +++ b/benchmarks/workloads/node_template/package.json @@ -0,0 +1,19 @@ +{ + "name": "nodedum", + "version": "1.0.0", + "description": "", + "main": "index.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "author": "", + "license": "ISC", + "dependencies": { + "express": "^4.16.4", + "hbs": "^4.0.4", + "redis": "^2.8.0", + "redis-commands": "^1.2.0", + "redis-parser": "^2.6.0", + "secure-random-string": "^1.1.0" + } +} diff --git a/benchmarks/workloads/redis/BUILD b/benchmarks/workloads/redis/BUILD new file mode 100644 index 000000000..83f3c71a0 --- /dev/null +++ b/benchmarks/workloads/redis/BUILD @@ -0,0 +1,11 @@ +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +filegroup( + name = "files", + srcs = [ + "Dockerfile", + ], +) diff --git a/benchmarks/workloads/redis/Dockerfile b/benchmarks/workloads/redis/Dockerfile new file mode 100644 index 000000000..0f17249af --- /dev/null +++ b/benchmarks/workloads/redis/Dockerfile @@ -0,0 +1 @@ +FROM redis:5.0.4 diff --git a/benchmarks/workloads/redisbenchmark/BUILD b/benchmarks/workloads/redisbenchmark/BUILD new file mode 100644 index 000000000..d40e75a3a --- /dev/null +++ b/benchmarks/workloads/redisbenchmark/BUILD @@ -0,0 +1,35 @@ +load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement") + +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +py_library( + name = "redisbenchmark", + srcs = ["__init__.py"], +) + +py_test( + name = "redisbenchmark_test", + srcs = ["redisbenchmark_test.py"], + python_version = "PY3", + deps = [ + ":redisbenchmark", + requirement("attrs", False), + requirement("atomicwrites", False), + requirement("more-itertools", False), + requirement("pathlib2", False), + requirement("pluggy", False), + requirement("py", False), + requirement("pytest", True), + requirement("six", False), + ], +) + +filegroup( + name = "files", + srcs = [ + "Dockerfile", + ], +) diff --git a/benchmarks/workloads/redisbenchmark/Dockerfile b/benchmarks/workloads/redisbenchmark/Dockerfile new file mode 100644 index 000000000..f94f6442e --- /dev/null +++ b/benchmarks/workloads/redisbenchmark/Dockerfile @@ -0,0 +1,4 @@ +FROM redis:5.0.4 +ENV host localhost +ENV port 6379 +CMD ["sh", "-c", "redis-benchmark --csv -h ${host} -p ${port} ${flags}"] diff --git a/benchmarks/workloads/redisbenchmark/__init__.py b/benchmarks/workloads/redisbenchmark/__init__.py new file mode 100644 index 000000000..229cef5fa --- /dev/null +++ b/benchmarks/workloads/redisbenchmark/__init__.py @@ -0,0 +1,85 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Redis-benchmark tool.""" + +import re + +OPERATIONS = [ + "PING_INLINE", + "PING_BULK", + "SET", + "GET", + "INCR", + "LPUSH", + "RPUSH", + "LPOP", + "RPOP", + "SADD", + "HSET", + "SPOP", + "LRANGE_100", + "LRANGE_300", + "LRANGE_500", + "LRANGE_600", + "MSET", +] + +METRICS = dict() + +SAMPLE_DATA = """ +"PING_INLINE","48661.80" +"PING_BULK","50301.81" +"SET","48923.68" +"GET","49382.71" +"INCR","49975.02" +"LPUSH","49875.31" +"RPUSH","50276.52" +"LPOP","50327.12" +"RPOP","50556.12" +"SADD","49504.95" +"HSET","49504.95" +"SPOP","50025.02" +"LPUSH (needed to benchmark LRANGE)","48875.86" +"LRANGE_100 (first 100 elements)","33955.86" +"LRANGE_300 (first 300 elements)","16550.81" +"LRANGE_500 (first 450 elements)","13653.74" +"LRANGE_600 (first 600 elements)","11219.57" +"MSET (10 keys)","44682.75" +""" + + +# pylint: disable=unused-argument +def sample(**kwargs) -> str: + return SAMPLE_DATA + + +# Bind a metric for each operation noted above. +for op in OPERATIONS: + + def bind(metric): + """Bind op to a new scope.""" + + # pylint: disable=unused-argument + def parse(data: str, **kwargs) -> float: + """Operation throughput in requests/sec.""" + regex = r"\"" + metric + r"( .*)?\",\"(\d*.\d*)" + res = re.compile(regex).search(data) + if res: + return float(res.group(2)) + return 0.0 + + parse.__name__ = metric + return parse + + METRICS[op] = bind(op) diff --git a/benchmarks/workloads/redisbenchmark/redisbenchmark_test.py b/benchmarks/workloads/redisbenchmark/redisbenchmark_test.py new file mode 100644 index 000000000..419ced059 --- /dev/null +++ b/benchmarks/workloads/redisbenchmark/redisbenchmark_test.py @@ -0,0 +1,51 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Parser test.""" + +import sys + +import pytest + +from benchmarks.workloads import redisbenchmark + +RESULTS = { + "PING_INLINE": 48661.80, + "PING_BULK": 50301.81, + "SET": 48923.68, + "GET": 49382.71, + "INCR": 49975.02, + "LPUSH": 49875.31, + "RPUSH": 50276.52, + "LPOP": 50327.12, + "RPOP": 50556.12, + "SADD": 49504.95, + "HSET": 49504.95, + "SPOP": 50025.02, + "LRANGE_100": 33955.86, + "LRANGE_300": 16550.81, + "LRANGE_500": 13653.74, + "LRANGE_600": 11219.57, + "MSET": 44682.75 +} + + +def test_metrics(): + """Test all metrics.""" + for (metric, func) in redisbenchmark.METRICS.items(): + res = func(redisbenchmark.sample()) + assert float(res) == RESULTS[metric] + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/benchmarks/workloads/ruby/BUILD b/benchmarks/workloads/ruby/BUILD new file mode 100644 index 000000000..9846c7e70 --- /dev/null +++ b/benchmarks/workloads/ruby/BUILD @@ -0,0 +1,15 @@ +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +filegroup( + name = "files", + srcs = [ + "Dockerfile", + "Gemfile", + "Gemfile.lock", + "config.ru", + "index.rb", + ], +) diff --git a/benchmarks/workloads/ruby/Dockerfile b/benchmarks/workloads/ruby/Dockerfile new file mode 100644 index 000000000..a9a7a7086 --- /dev/null +++ b/benchmarks/workloads/ruby/Dockerfile @@ -0,0 +1,28 @@ +# example based on https://github.com/errm/fib + +FROM ruby:2.5 + +RUN apt-get update -qq && apt-get install -y build-essential libpq-dev nodejs libsodium-dev + +# Set an environment variable where the Rails app is installed to inside of Docker image +ENV RAILS_ROOT /var/www/app_name +RUN mkdir -p $RAILS_ROOT + +# Set working directory +WORKDIR $RAILS_ROOT + +# Setting env up +ENV RAILS_ENV='production' +ENV RACK_ENV='production' + +# Adding gems +COPY Gemfile Gemfile +COPY Gemfile.lock Gemfile.lock +RUN bundle install --jobs 20 --retry 5 --without development test + +# Adding project files +COPY . . + +EXPOSE $PORT +STOPSIGNAL SIGINT +CMD ["bundle", "exec", "puma", "config.ru"] diff --git a/benchmarks/workloads/ruby/Gemfile b/benchmarks/workloads/ruby/Gemfile new file mode 100644 index 000000000..8f1bdad6e --- /dev/null +++ b/benchmarks/workloads/ruby/Gemfile @@ -0,0 +1,12 @@ +source "https://rubygems.org" +# load a bunch of dependencies to take up memory +gem "sinatra" +gem "puma" +gem "redis" +gem 'rake' +gem 'squid', '~> 1.4' +gem 'cassandra-driver' +gem 'ruby-fann' +gem 'rbnacl' +gem 'bcrypt' +gem "activemerchant"
\ No newline at end of file diff --git a/benchmarks/workloads/ruby/Gemfile.lock b/benchmarks/workloads/ruby/Gemfile.lock new file mode 100644 index 000000000..b44817bd3 --- /dev/null +++ b/benchmarks/workloads/ruby/Gemfile.lock @@ -0,0 +1,55 @@ +GEM + remote: https://rubygems.org/ + specs: + activesupport (5.2.3) + concurrent-ruby (~> 1.0, >= 1.0.2) + i18n (>= 0.7, < 2) + minitest (~> 5.1) + tzinfo (~> 1.1) + cassandra-driver (3.2.3) + ione (~> 1.2) + concurrent-ruby (1.1.5) + i18n (1.6.0) + concurrent-ruby (~> 1.0) + ione (1.2.4) + minitest (5.11.3) + mustermann (1.0.3) + pdf-core (0.7.0) + prawn (2.2.2) + pdf-core (~> 0.7.0) + ttfunk (~> 1.5) + puma (3.12.1) + rack (2.0.7) + rack-protection (2.0.5) + rack + rake (12.3.2) + redis (4.1.1) + ruby-fann (1.2.6) + sinatra (2.0.5) + mustermann (~> 1.0) + rack (~> 2.0) + rack-protection (= 2.0.5) + tilt (~> 2.0) + squid (1.4.1) + activesupport (>= 4.0) + prawn (~> 2.2) + thread_safe (0.3.6) + tilt (2.0.9) + ttfunk (1.5.1) + tzinfo (1.2.5) + thread_safe (~> 0.1) + +PLATFORMS + ruby + +DEPENDENCIES + cassandra-driver + puma + rake + redis + ruby-fann + sinatra + squid (~> 1.4) + +BUNDLED WITH + 1.17.1 diff --git a/benchmarks/workloads/ruby/config.ru b/benchmarks/workloads/ruby/config.ru new file mode 100755 index 000000000..fbd5acc82 --- /dev/null +++ b/benchmarks/workloads/ruby/config.ru @@ -0,0 +1,2 @@ +require './index' +run Sinatra::Application
\ No newline at end of file diff --git a/benchmarks/workloads/ruby/index.rb b/benchmarks/workloads/ruby/index.rb new file mode 100755 index 000000000..5fa85af93 --- /dev/null +++ b/benchmarks/workloads/ruby/index.rb @@ -0,0 +1,14 @@ +require "sinatra" +require "puma" +require "redis" +require "rake" +require "squid" +require "cassandra" +require "ruby-fann" +require "rbnacl" +require "bcrypt" +require "activemerchant" + +get "/" do + "Hello World!" +end
\ No newline at end of file diff --git a/benchmarks/workloads/ruby_template/BUILD b/benchmarks/workloads/ruby_template/BUILD new file mode 100644 index 000000000..2b99892af --- /dev/null +++ b/benchmarks/workloads/ruby_template/BUILD @@ -0,0 +1,16 @@ +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +filegroup( + name = "files", + srcs = [ + "Dockerfile", + "Gemfile", + "Gemfile.lock", + "config.ru", + "index.erb", + "main.rb", + ], +) diff --git a/benchmarks/workloads/ruby_template/Dockerfile b/benchmarks/workloads/ruby_template/Dockerfile new file mode 100755 index 000000000..a06d68bf4 --- /dev/null +++ b/benchmarks/workloads/ruby_template/Dockerfile @@ -0,0 +1,38 @@ +# example based on https://github.com/errm/fib + +FROM alpine:3.9 as build + +COPY Gemfile Gemfile.lock ./ + +RUN apk add --no-cache ruby ruby-dev ruby-bundler ruby-json build-base bash \ + && bundle install --frozen -j4 -r3 --no-cache --without development \ + && apk del --no-cache ruby-bundler \ + && rm -rf /usr/lib/ruby/gems/*/cache + +FROM alpine:3.9 as prod + +COPY --from=build /usr/lib/ruby/gems /usr/lib/ruby/gems +RUN apk add --no-cache ruby ruby-json ruby-etc redis apache2-utils \ + && ruby -e "Gem::Specification.map.each do |spec| \ + Gem::Installer.for_spec( \ + spec, \ + wrappers: true, \ + force: true, \ + install_dir: spec.base_dir, \ + build_args: spec.build_args, \ + ).generate_bin \ + end" + +WORKDIR /app +COPY . /app/. + +ENV PORT=9292 \ + WEB_CONCURRENCY=20 \ + WEB_MAX_THREADS=20 \ + RACK_ENV=production + +ENV host localhost +EXPOSE $PORT +USER nobody +STOPSIGNAL SIGINT +CMD ["sh", "-c", "/usr/bin/puma", "${host}"] diff --git a/benchmarks/workloads/ruby_template/Gemfile b/benchmarks/workloads/ruby_template/Gemfile new file mode 100755 index 000000000..ac521b32c --- /dev/null +++ b/benchmarks/workloads/ruby_template/Gemfile @@ -0,0 +1,5 @@ +source "https://rubygems.org" + +gem "sinatra" +gem "puma" +gem "redis"
\ No newline at end of file diff --git a/benchmarks/workloads/ruby_template/Gemfile.lock b/benchmarks/workloads/ruby_template/Gemfile.lock new file mode 100644 index 000000000..dd8d56fb7 --- /dev/null +++ b/benchmarks/workloads/ruby_template/Gemfile.lock @@ -0,0 +1,26 @@ +GEM + remote: https://rubygems.org/ + specs: + mustermann (1.0.3) + puma (3.12.0) + rack (2.0.6) + rack-protection (2.0.5) + rack + sinatra (2.0.5) + mustermann (~> 1.0) + rack (~> 2.0) + rack-protection (= 2.0.5) + tilt (~> 2.0) + tilt (2.0.9) + redis (4.1.0) + +PLATFORMS + ruby + +DEPENDENCIES + puma + sinatra + redis + +BUNDLED WITH + 1.17.1
\ No newline at end of file diff --git a/benchmarks/workloads/ruby_template/config.ru b/benchmarks/workloads/ruby_template/config.ru new file mode 100755 index 000000000..b2d135cc0 --- /dev/null +++ b/benchmarks/workloads/ruby_template/config.ru @@ -0,0 +1,2 @@ +require './main' +run Sinatra::Application
\ No newline at end of file diff --git a/benchmarks/workloads/ruby_template/index.erb b/benchmarks/workloads/ruby_template/index.erb new file mode 100755 index 000000000..7f7300e80 --- /dev/null +++ b/benchmarks/workloads/ruby_template/index.erb @@ -0,0 +1,8 @@ +<!DOCTYPE html> +<html> +<body> + <% text.each do |t| %> + <p><%= t %></p> + <% end %> +</body> +</html> diff --git a/benchmarks/workloads/ruby_template/main.rb b/benchmarks/workloads/ruby_template/main.rb new file mode 100755 index 000000000..35c239377 --- /dev/null +++ b/benchmarks/workloads/ruby_template/main.rb @@ -0,0 +1,27 @@ +require "sinatra" +require "securerandom" +require "redis" + +redis_host = ENV["host"] +$redis = Redis.new(host: redis_host) + +def generateText + for i in 0..99 + $redis.set(i, randomBody(1024)) + end +end + +def randomBody(length) + return SecureRandom.alphanumeric(length) +end + +generateText +template = ERB.new(File.read('./index.erb')) + +get "/" do + texts = Array.new + for i in 0..4 + texts.push($redis.get(rand(0..99))) + end + template.result_with_hash(text: texts) +end
\ No newline at end of file diff --git a/benchmarks/workloads/sleep/BUILD b/benchmarks/workloads/sleep/BUILD new file mode 100644 index 000000000..83f3c71a0 --- /dev/null +++ b/benchmarks/workloads/sleep/BUILD @@ -0,0 +1,11 @@ +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +filegroup( + name = "files", + srcs = [ + "Dockerfile", + ], +) diff --git a/benchmarks/workloads/sleep/Dockerfile b/benchmarks/workloads/sleep/Dockerfile new file mode 100644 index 000000000..24c72e07a --- /dev/null +++ b/benchmarks/workloads/sleep/Dockerfile @@ -0,0 +1,3 @@ +FROM alpine:latest + +CMD ["sleep", "315360000"] diff --git a/benchmarks/workloads/sysbench/BUILD b/benchmarks/workloads/sysbench/BUILD new file mode 100644 index 000000000..35f4d460b --- /dev/null +++ b/benchmarks/workloads/sysbench/BUILD @@ -0,0 +1,35 @@ +load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement") + +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +py_library( + name = "sysbench", + srcs = ["__init__.py"], +) + +py_test( + name = "sysbench_test", + srcs = ["sysbench_test.py"], + python_version = "PY3", + deps = [ + ":sysbench", + requirement("attrs", False), + requirement("atomicwrites", False), + requirement("more-itertools", False), + requirement("pathlib2", False), + requirement("pluggy", False), + requirement("py", False), + requirement("pytest", True), + requirement("six", False), + ], +) + +filegroup( + name = "files", + srcs = [ + "Dockerfile", + ], +) diff --git a/benchmarks/workloads/sysbench/Dockerfile b/benchmarks/workloads/sysbench/Dockerfile new file mode 100644 index 000000000..8225e0e14 --- /dev/null +++ b/benchmarks/workloads/sysbench/Dockerfile @@ -0,0 +1,16 @@ +FROM ubuntu:18.04 + +RUN set -x \ + && apt-get update \ + && apt-get install -y \ + sysbench \ + && rm -rf /var/lib/apt/lists/* + +# Parameterize the tests. +ENV test cpu +ENV threads 1 +ENV options "" + +# run sysbench once as a warm-up and take the second result +CMD ["sh", "-c", "sysbench --threads=8 --memory-total-size=5G memory run > /dev/null && \ +sysbench --threads=${threads} ${options} ${test} run"] diff --git a/benchmarks/workloads/sysbench/__init__.py b/benchmarks/workloads/sysbench/__init__.py new file mode 100644 index 000000000..de357b4db --- /dev/null +++ b/benchmarks/workloads/sysbench/__init__.py @@ -0,0 +1,167 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Sysbench.""" + +import re + +STD_REGEX = r"events per second:\s*(\d*.?\d*)\n" +MEM_REGEX = r"Total\soperations:\s+\d*\s*\((\d*\.\d*)\sper\ssecond\)" +ALT_REGEX = r"execution time \(avg/stddev\):\s*(\d*.?\d*)/(\d*.?\d*)" +AVG_REGEX = r"avg:[^\n^\d]*(\d*\.?\d*)" + +SAMPLE_CPU_DATA = """ +sysbench 1.0.11 (using system LuaJIT 2.1.0-beta3) + +Running the test with following options: +Number of threads: 8 +Initializing random number generator from current time + + +Prime numbers limit: 10000 + +Initializing worker threads... + +Threads started! + +CPU speed: + events per second: 9093.38 + +General statistics: + total time: 10.0007s + total number of events: 90949 + +Latency (ms): + min: 0.64 + avg: 0.88 + max: 24.65 + 95th percentile: 1.55 + sum: 79936.91 + +Threads fairness: + events (avg/stddev): 11368.6250/831.38 + execution time (avg/stddev): 9.9921/0.01 +""" + +SAMPLE_MEMORY_DATA = """ +sysbench 1.0.11 (using system LuaJIT 2.1.0-beta3) + +Running the test with following options: +Number of threads: 8 +Initializing random number generator from current time + + +Running memory speed test with the following options: + block size: 1KiB + total size: 102400MiB + operation: write + scope: global + +Initializing worker threads... + +Threads started! + +Total operations: 47999046 (9597428.64 per second) + +46874.07 MiB transferred (9372.49 MiB/sec) + + +General statistics: + total time: 5.0001s + total number of events: 47999046 + +Latency (ms): + min: 0.00 + avg: 0.00 + max: 0.21 + 95th percentile: 0.00 + sum: 33165.91 + +Threads fairness: + events (avg/stddev): 5999880.7500/111242.52 + execution time (avg/stddev): 4.1457/0.09 +""" + +SAMPLE_MUTEX_DATA = """ +sysbench 1.0.11 (using system LuaJIT 2.1.0-beta3) + +Running the test with following options: +Number of threads: 8 +Initializing random number generator from current time + + +Initializing worker threads... + +Threads started! + + +General statistics: + total time: 3.7869s + total number of events: 8 + +Latency (ms): + min: 3688.56 + avg: 3754.03 + max: 3780.94 + 95th percentile: 3773.42 + sum: 30032.28 + +Threads fairness: + events (avg/stddev): 1.0000/0.00 + execution time (avg/stddev): 3.7540/0.03 +""" + + +# pylint: disable=unused-argument +def sample(test, **kwargs): + switch = { + "cpu": SAMPLE_CPU_DATA, + "memory": SAMPLE_MEMORY_DATA, + "mutex": SAMPLE_MUTEX_DATA, + "randwr": SAMPLE_CPU_DATA + } + return switch[test] + + +# pylint: disable=unused-argument +def cpu_events_per_second(data: str, **kwargs) -> float: + """Returns events per second.""" + return float(re.compile(STD_REGEX).search(data).group(1)) + + +# pylint: disable=unused-argument +def memory_ops_per_second(data: str, **kwargs) -> float: + """Returns memory operations per second.""" + return float(re.compile(MEM_REGEX).search(data).group(1)) + + +# pylint: disable=unused-argument +def mutex_time(data: str, count: int, locks: int, threads: int, + **kwargs) -> float: + """Returns normalized mutex time (lower is better).""" + value = float(re.compile(ALT_REGEX).search(data).group(1)) + contention = float(threads) / float(locks) + scale = contention * float(count) / 100000000.0 + return value / scale + + +# pylint: disable=unused-argument +def mutex_deviation(data: str, **kwargs) -> float: + """Returns deviation for threads.""" + return float(re.compile(ALT_REGEX).search(data).group(2)) + + +# pylint: disable=unused-argument +def mutex_latency(data: str, **kwargs) -> float: + """Returns average mutex latency.""" + return float(re.compile(AVG_REGEX).search(data).group(1)) diff --git a/benchmarks/workloads/sysbench/sysbench_test.py b/benchmarks/workloads/sysbench/sysbench_test.py new file mode 100644 index 000000000..3fb541fd2 --- /dev/null +++ b/benchmarks/workloads/sysbench/sysbench_test.py @@ -0,0 +1,34 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Parser test.""" + +import sys + +import pytest + +from benchmarks.workloads import sysbench + + +def test_sysbench_parser(): + """Test the basic parser.""" + assert sysbench.cpu_events_per_second(sysbench.sample("cpu")) == 9093.38 + assert sysbench.memory_ops_per_second(sysbench.sample("memory")) == 9597428.64 + assert sysbench.mutex_time(sysbench.sample("mutex"), 1, 1, + 100000000.0) == 3.754 + assert sysbench.mutex_deviation(sysbench.sample("mutex")) == 0.03 + assert sysbench.mutex_latency(sysbench.sample("mutex")) == 3754.03 + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/benchmarks/workloads/syscall/BUILD b/benchmarks/workloads/syscall/BUILD new file mode 100644 index 000000000..e1ff3059b --- /dev/null +++ b/benchmarks/workloads/syscall/BUILD @@ -0,0 +1,36 @@ +load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement") + +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +py_library( + name = "syscall", + srcs = ["__init__.py"], +) + +py_test( + name = "syscall_test", + srcs = ["syscall_test.py"], + python_version = "PY3", + deps = [ + ":syscall", + requirement("attrs", False), + requirement("atomicwrites", False), + requirement("more-itertools", False), + requirement("pathlib2", False), + requirement("pluggy", False), + requirement("py", False), + requirement("pytest", True), + requirement("six", False), + ], +) + +filegroup( + name = "files", + srcs = [ + "Dockerfile", + "syscall.c", + ], +) diff --git a/benchmarks/workloads/syscall/Dockerfile b/benchmarks/workloads/syscall/Dockerfile new file mode 100644 index 000000000..a2088d953 --- /dev/null +++ b/benchmarks/workloads/syscall/Dockerfile @@ -0,0 +1,6 @@ +FROM gcc:latest +COPY . /usr/src/syscall +WORKDIR /usr/src/syscall +RUN gcc -O2 -o syscall syscall.c +ENV count 1000000 +CMD ["sh", "-c", "./syscall ${count}"] diff --git a/benchmarks/workloads/syscall/__init__.py b/benchmarks/workloads/syscall/__init__.py new file mode 100644 index 000000000..dc9028faa --- /dev/null +++ b/benchmarks/workloads/syscall/__init__.py @@ -0,0 +1,29 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Simple syscall test.""" + +import re + +SAMPLE_DATA = "Called getpid syscall 1000000 times: 1117 ms, 500 ns each." + + +# pylint: disable=unused-argument +def sample(**kwargs) -> str: + return SAMPLE_DATA + + +# pylint: disable=unused-argument +def syscall_time_ns(data: str, **kwargs) -> int: + """Returns average system call time.""" + return float(re.compile(r"(\d+)\sns each.").search(data).group(1)) diff --git a/benchmarks/workloads/syscall/syscall.c b/benchmarks/workloads/syscall/syscall.c new file mode 100644 index 000000000..ded030397 --- /dev/null +++ b/benchmarks/workloads/syscall/syscall.c @@ -0,0 +1,55 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <time.h> +#include <unistd.h> + +// Short program that calls getpid() a number of times and outputs time +// diference from the MONOTONIC clock. +int main(int argc, char** argv) { + struct timespec start, stop; + long result; + char buf[80]; + + if (argc < 2) { + printf("Usage:./syscall NUM_TIMES_TO_CALL"); + return 1; + } + + if (clock_gettime(CLOCK_MONOTONIC, &start)) return 1; + + long loops = atoi(argv[1]); + for (long i = 0; i < loops; i++) { + syscall(SYS_gettimeofday, 0, 0); + } + + if (clock_gettime(CLOCK_MONOTONIC, &stop)) return 1; + + if ((stop.tv_nsec - start.tv_nsec) < 0) { + result = (stop.tv_sec - start.tv_sec - 1) * 1000; + result += (stop.tv_nsec - start.tv_nsec + 1000000000) / (1000 * 1000); + } else { + result = (stop.tv_sec - start.tv_sec) * 1000; + result += (stop.tv_nsec - start.tv_nsec) / (1000 * 1000); + } + + printf("Called getpid syscall %d times: %lu ms, %lu ns each.\n", loops, + result, result * 1000000 / loops); + + return 0; +} diff --git a/benchmarks/workloads/syscall/syscall_test.py b/benchmarks/workloads/syscall/syscall_test.py new file mode 100644 index 000000000..72f027de1 --- /dev/null +++ b/benchmarks/workloads/syscall/syscall_test.py @@ -0,0 +1,27 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +import pytest + +from benchmarks.workloads import syscall + + +def test_syscall_time_ns(): + assert syscall.syscall_time_ns(syscall.sample()) == 500 + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/benchmarks/workloads/tensorflow/BUILD b/benchmarks/workloads/tensorflow/BUILD new file mode 100644 index 000000000..17f1f8ebb --- /dev/null +++ b/benchmarks/workloads/tensorflow/BUILD @@ -0,0 +1,16 @@ +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +py_library( + name = "tensorflow", + srcs = ["__init__.py"], +) + +filegroup( + name = "files", + srcs = [ + "Dockerfile", + ], +) diff --git a/benchmarks/workloads/tensorflow/Dockerfile b/benchmarks/workloads/tensorflow/Dockerfile new file mode 100644 index 000000000..262643b98 --- /dev/null +++ b/benchmarks/workloads/tensorflow/Dockerfile @@ -0,0 +1,14 @@ +FROM tensorflow/tensorflow:1.13.2 + +RUN apt-get update \ + && apt-get install -y git +RUN git clone https://github.com/aymericdamien/TensorFlow-Examples.git +RUN python -m pip install -U pip setuptools +RUN python -m pip install matplotlib + +WORKDIR /TensorFlow-Examples/examples + +ENV PYTHONPATH="$PYTHONPATH:/TensorFlow-Examples/examples" + +ENV workload "3_NeuralNetworks/convolutional_network.py" +CMD python ${workload} diff --git a/benchmarks/workloads/tensorflow/__init__.py b/benchmarks/workloads/tensorflow/__init__.py new file mode 100644 index 000000000..b5ec213f8 --- /dev/null +++ b/benchmarks/workloads/tensorflow/__init__.py @@ -0,0 +1,20 @@ +# python3 +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""A Tensorflow example.""" + + +# pylint: disable=unused-argument +def run_time(value, **kwargs): + """Returns the startup and runtime of the Tensorflow workload in seconds.""" + return value diff --git a/benchmarks/workloads/true/BUILD b/benchmarks/workloads/true/BUILD new file mode 100644 index 000000000..83f3c71a0 --- /dev/null +++ b/benchmarks/workloads/true/BUILD @@ -0,0 +1,11 @@ +package( + default_visibility = ["//benchmarks:__subpackages__"], + licenses = ["notice"], +) + +filegroup( + name = "files", + srcs = [ + "Dockerfile", + ], +) diff --git a/benchmarks/workloads/true/Dockerfile b/benchmarks/workloads/true/Dockerfile new file mode 100644 index 000000000..2e97c921e --- /dev/null +++ b/benchmarks/workloads/true/Dockerfile @@ -0,0 +1,3 @@ +FROM alpine:latest + +CMD ["true"] @@ -1,21 +1,21 @@ module gvisor.dev/gvisor -go 1.12 +go 1.13 require ( - github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422 - github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079 - github.com/golang/mock v1.3.1 - github.com/golang/protobuf v1.3.1 - github.com/google/btree v1.0.0 - github.com/google/go-cmp v0.2.0 - github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8 - github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d - github.com/kr/pty v1.1.1 - github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78 - github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2 - github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e - github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936 - golang.org/x/net v0.0.0-20190311183353-d8887717615a - golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a + github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422 + github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079 + github.com/golang/mock v1.3.1 + github.com/golang/protobuf v1.3.1 + github.com/google/btree v1.0.0 + github.com/google/go-cmp v0.2.0 + github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8 + github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d + github.com/kr/pty v1.1.1 + github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78 + github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2 + github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e + github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936 + golang.org/x/net v0.0.0-20190311183353-d8887717615a + golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a ) diff --git a/kokoro/iptables_tests.cfg b/kokoro/iptables_tests.cfg new file mode 100644 index 000000000..7af20629a --- /dev/null +++ b/kokoro/iptables_tests.cfg @@ -0,0 +1,10 @@ +build_file: "repo/scripts/iptables_test.sh" + +action { + define_artifacts { + regex: "**/sponge_log.xml" + regex: "**/sponge_log.log" + regex: "**/outputs.zip" + regex: "**/runsc_logs_*.tar.gz" + } +} diff --git a/kokoro/kythe/generate_xrefs.cfg b/kokoro/kythe/generate_xrefs.cfg new file mode 100644 index 000000000..03e65c54e --- /dev/null +++ b/kokoro/kythe/generate_xrefs.cfg @@ -0,0 +1,28 @@ +build_file: "gvisor/kokoro/kythe/generate_xrefs.sh" + +before_action { + fetch_keystore { + keystore_resource { + keystore_config_id: 73898 + keyname: "kokoro-rbe-service-account" + } + } +} + +bazel_setting { + project_id: "gvisor-rbe" + local_execution: false + auth_credential: { + keystore_config_id: 73898 + keyname: "kokoro-rbe-service-account" + } + bes_backend_address: "buildeventservice.googleapis.com" + foundry_backend_address: "remotebuildexecution.googleapis.com" + upsalite_frontend_address: "https://source.cloud.google.com" +} + +action { + define_artifacts { + regex: "*.kzip" + } +} diff --git a/kokoro/kythe/generate_xrefs.sh b/kokoro/kythe/generate_xrefs.sh new file mode 100644 index 000000000..799467a34 --- /dev/null +++ b/kokoro/kythe/generate_xrefs.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# Copyright 2019 The gVisor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -ex + +# Install the latest version of Bazel. The default on Kokoro images is out of +# date. +if command -v use_bazel.sh >/dev/null; then + use_bazel.sh latest +fi +bazel version + +python3 -V + +readonly KYTHE_VERSION='v0.0.37' +readonly WORKDIR="$(mktemp -d)" +readonly KYTHE_DIR="${WORKDIR}/kythe-${KYTHE_VERSION}" +if [[ -n "$KOKORO_GIT_COMMIT" ]]; then + readonly KZIP_FILENAME="${KOKORO_ARTIFACTS_DIR}/${KOKORO_GIT_COMMIT}.kzip" +else + readonly KZIP_FILENAME="$(git rev-parse HEAD).kzip" +fi + +wget -q -O "${WORKDIR}/kythe.tar.gz" \ + "https://github.com/kythe/kythe/releases/download/${KYTHE_VERSION}/kythe-${KYTHE_VERSION}.tar.gz" +tar --no-same-owner -xzf "${WORKDIR}/kythe.tar.gz" --directory "$WORKDIR" + +if [[ -n "$KOKORO_ARTIFACTS_DIR" ]]; then + cd "${KOKORO_ARTIFACTS_DIR}/github/gvisor" +fi +bazel \ + --bazelrc="${KYTHE_DIR}/extractors.bazelrc" \ + build \ + --override_repository kythe_release="${KYTHE_DIR}" \ + --define=kythe_corpus=gvisor.dev \ + --cxxopt=-std=c++17 \ + //... + +"${KYTHE_DIR}/tools/kzip" merge \ + --output "$KZIP_FILENAME" \ + $(find -L bazel-out/*/extra_actions/ -name '*.kzip') diff --git a/kokoro/ubuntu1604/10_core.sh b/kokoro/ubuntu1604/10_core.sh index e87a6eee8..46dda6bb1 100755 --- a/kokoro/ubuntu1604/10_core.sh +++ b/kokoro/ubuntu1604/10_core.sh @@ -21,8 +21,8 @@ apt-get update && apt-get -y install make git-core build-essential linux-headers # Install a recent go toolchain. if ! [[ -d /usr/local/go ]]; then - wget https://dl.google.com/go/go1.12.linux-amd64.tar.gz - tar -xvf go1.12.linux-amd64.tar.gz + wget https://dl.google.com/go/go1.13.5.linux-amd64.tar.gz + tar -xvf go1.13.5.linux-amd64.tar.gz mv go /usr/local fi diff --git a/kokoro/ubuntu1604/40_kokoro.sh b/kokoro/ubuntu1604/40_kokoro.sh index 3f50929d5..5f2dfc858 100755 --- a/kokoro/ubuntu1604/40_kokoro.sh +++ b/kokoro/ubuntu1604/40_kokoro.sh @@ -23,7 +23,7 @@ declare -r ssh_public_keys=( ) # Install dependencies. -apt-get update && apt-get install -y rsync coreutils python-psutil qemu-kvm python-pip zip +apt-get update && apt-get install -y rsync coreutils python-psutil qemu-kvm python-pip python3-pip zip # junitparser is used to merge junit xml files. pip install junitparser diff --git a/kokoro/ubuntu1604/README.md b/kokoro/ubuntu1604/README.md new file mode 100644 index 000000000..64f913b9a --- /dev/null +++ b/kokoro/ubuntu1604/README.md @@ -0,0 +1,34 @@ +## Image Update + +After making changes to files in the directory, you must run the following +commands to update the image Kokoro uses: + +```shell +gcloud config set project gvisor-kokoro-testing +third_party/gvisor/kokoro/ubuntu1604/build.sh +third_party/gvisor/kokoro/ubuntu1804/build.sh +``` + +Note: the command above will change your default project for `gcloud`. Run +`gcloud config set project` again to revert back to your default project. + +Note: Files in `third_party/gvisor/kokoro/ubuntu1804/` as symlinks to +`ubuntu1604`, therefore both images must be updated. + +After the script finishes, the last few lines of the output will container the +image name. If the output was lost, you can run `build.sh` again to print the +image name. + +``` +NAME PROJECT FAMILY DEPRECATED STATUS +image-6777fa4666a968c8 gvisor-kokoro-testing READY ++ cleanup ++ gcloud compute instances delete --quiet build-tlfrdv +Deleted [https://www.googleapis.com/compute/v1/projects/gvisor-kokoro-testing/zones/us-central1-f/instances/build-tlfrdv]. +``` + +To setup Kokoro to use the new image, copy the image names to their +corresponding file below: + +* //devtools/kokoro/config/gcp/gvisor/ubuntu1604.gcl +* //devtools/kokoro/config/gcp/gvisor/ubuntu1804.gcl diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index 51774c6b6..716ff22d2 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -41,6 +41,7 @@ go_library( "poll.go", "prctl.go", "ptrace.go", + "rseq.go", "rusage.go", "sched.go", "seccomp.go", @@ -57,6 +58,7 @@ go_library( "uio.go", "utsname.go", "wait.go", + "xattr.go", ], importpath = "gvisor.dev/gvisor/pkg/abi/linux", visibility = ["//visibility:public"], diff --git a/pkg/abi/linux/fcntl.go b/pkg/abi/linux/fcntl.go index f78315ebf..6663a199c 100644 --- a/pkg/abi/linux/fcntl.go +++ b/pkg/abi/linux/fcntl.go @@ -16,15 +16,17 @@ package linux // Commands from linux/fcntl.h. const ( - F_DUPFD = 0x0 - F_GETFD = 0x1 - F_SETFD = 0x2 - F_GETFL = 0x3 - F_SETFL = 0x4 - F_SETLK = 0x6 - F_SETLKW = 0x7 - F_SETOWN = 0x8 - F_GETOWN = 0x9 + F_DUPFD = 0 + F_GETFD = 1 + F_SETFD = 2 + F_GETFL = 3 + F_SETFL = 4 + F_SETLK = 6 + F_SETLKW = 7 + F_SETOWN = 8 + F_GETOWN = 9 + F_SETOWN_EX = 15 + F_GETOWN_EX = 16 F_DUPFD_CLOEXEC = 1024 + 6 F_SETPIPE_SZ = 1024 + 7 F_GETPIPE_SZ = 1024 + 8 @@ -32,9 +34,9 @@ const ( // Commands for F_SETLK. const ( - F_RDLCK = 0x0 - F_WRLCK = 0x1 - F_UNLCK = 0x2 + F_RDLCK = 0 + F_WRLCK = 1 + F_UNLCK = 2 ) // Flags for fcntl. @@ -42,7 +44,7 @@ const ( FD_CLOEXEC = 00000001 ) -// Lock structure for F_SETLK. +// Flock is the lock structure for F_SETLK. type Flock struct { Type int16 Whence int16 @@ -52,3 +54,16 @@ type Flock struct { Pid int32 _ [4]byte } + +// Flags for F_SETOWN_EX and F_GETOWN_EX. +const ( + F_OWNER_TID = 0 + F_OWNER_PID = 1 + F_OWNER_PGRP = 2 +) + +// FOwnerEx is the owner structure for F_SETOWN_EX and F_GETOWN_EX. +type FOwnerEx struct { + Type int32 + PID int32 +} diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index c9ee098f4..16791d03e 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -144,9 +144,13 @@ const ( ModeCharacterDevice = S_IFCHR ModeNamedPipe = S_IFIFO - ModeSetUID = 04000 - ModeSetGID = 02000 - ModeSticky = 01000 + S_ISUID = 04000 + S_ISGID = 02000 + S_ISVTX = 01000 + + ModeSetUID = S_ISUID + ModeSetGID = S_ISGID + ModeSticky = S_ISVTX ModeUserAll = 0700 ModeUserRead = 0400 @@ -282,6 +286,29 @@ func (m FileMode) String() string { return strings.Join(s, "|") } +// DirentType maps file types to dirent types appropriate for (struct +// dirent)::d_type. +func (m FileMode) DirentType() uint8 { + switch m.FileType() { + case ModeSocket: + return DT_SOCK + case ModeSymlink: + return DT_LNK + case ModeRegular: + return DT_REG + case ModeBlockDevice: + return DT_BLK + case ModeDirectory: + return DT_DIR + case ModeCharacterDevice: + return DT_CHR + case ModeNamedPipe: + return DT_FIFO + default: + return DT_UNKNOWN + } +} + var modeExtraBits = abi.FlagSet{ { Flag: ModeSetUID, diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go index b416e3472..2c652baa2 100644 --- a/pkg/abi/linux/fs.go +++ b/pkg/abi/linux/fs.go @@ -92,3 +92,10 @@ const ( SYNC_FILE_RANGE_WRITE = 2 SYNC_FILE_RANGE_WAIT_AFTER = 4 ) + +// Flag argument to renameat2(2), from include/uapi/linux/fs.h. +const ( + RENAME_NOREPLACE = (1 << 0) // Don't overwrite target. + RENAME_EXCHANGE = (1 << 1) // Exchange src and dst. + RENAME_WHITEOUT = (1 << 2) // Whiteout src. +) diff --git a/pkg/abi/linux/rseq.go b/pkg/abi/linux/rseq.go new file mode 100644 index 000000000..76253ba30 --- /dev/null +++ b/pkg/abi/linux/rseq.go @@ -0,0 +1,130 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// Flags passed to rseq(2). +// +// Defined in include/uapi/linux/rseq.h. +const ( + // RSEQ_FLAG_UNREGISTER unregisters the current thread. + RSEQ_FLAG_UNREGISTER = 1 << 0 +) + +// Critical section flags used in RSeqCriticalSection.Flags and RSeq.Flags. +// +// Defined in include/uapi/linux/rseq.h. +const ( + // RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT inhibits restart on preemption. + RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = 1 << 0 + + // RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL inhibits restart on signal + // delivery. + RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = 1 << 1 + + // RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE inhibits restart on CPU + // migration. + RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = 1 << 2 +) + +// RSeqCriticalSection describes a restartable sequences critical section. It +// is equivalent to struct rseq_cs, defined in include/uapi/linux/rseq.h. +// +// In userspace, this structure is always aligned to 32 bytes. +// +// +marshal +type RSeqCriticalSection struct { + // Version is the version of this structure. Version 0 is defined here. + Version uint32 + + // Flags are the critical section flags, defined above. + Flags uint32 + + // Start is the start address of the critical section. + Start uint64 + + // PostCommitOffset is the offset from Start of the first instruction + // outside of the critical section. + PostCommitOffset uint64 + + // Abort is the abort address. It must be outside the critical section, + // and the 4 bytes prior must match the abort signature. + Abort uint64 +} + +const ( + // SizeOfRSeqCriticalSection is the size of RSeqCriticalSection. + SizeOfRSeqCriticalSection = 32 + + // SizeOfRSeqSignature is the size of the signature immediately + // preceding RSeqCriticalSection.Abort. + SizeOfRSeqSignature = 4 +) + +// Special values for RSeq.CPUID, defined in include/uapi/linux/rseq.h. +const ( + // RSEQ_CPU_ID_UNINITIALIZED indicates that this thread has not + // performed rseq initialization. + RSEQ_CPU_ID_UNINITIALIZED = ^uint32(0) // -1 + + // RSEQ_CPU_ID_REGISTRATION_FAILED indicates that rseq initialization + // failed. + RSEQ_CPU_ID_REGISTRATION_FAILED = ^uint32(1) // -2 +) + +// RSeq is the thread-local restartable sequences config/status. It +// is equivalent to struct rseq, defined in include/uapi/linux/rseq.h. +// +// In userspace, this structure is always aligned to 32 bytes. +type RSeq struct { + // CPUIDStart contains the current CPU ID if rseq is initialized. + // + // This field should only be read by the thread which registered this + // structure, and must be read atomically. + CPUIDStart uint32 + + // CPUID contains the current CPU ID or one of the CPU ID special + // values defined above. + // + // This field should only be read by the thread which registered this + // structure, and must be read atomically. + CPUID uint32 + + // RSeqCriticalSection is a pointer to the current RSeqCriticalSection + // block, or NULL. It is reset to NULL by the kernel on restart or + // non-restarting preempt/signal. + // + // This field should only be written by the thread which registered + // this structure, and must be written atomically. + RSeqCriticalSection uint64 + + // Flags are the critical section flags that apply to all critical + // sections on this thread, defined above. + Flags uint32 +} + +const ( + // SizeOfRSeq is the size of RSeq. + // + // Note that RSeq is naively 24 bytes. However, it has 32-byte + // alignment, which in C increases sizeof to 32. That is the size that + // the Linux kernel uses. + SizeOfRSeq = 32 + + // AlignOfRSeq is the standard alignment of RSeq. + AlignOfRSeq = 32 + + // OffsetOfRSeqCriticalSection is the offset of RSeqCriticalSection in RSeq. + OffsetOfRSeqCriticalSection = 8 +) diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go index 2e2cc6be7..766ee4014 100644 --- a/pkg/abi/linux/socket.go +++ b/pkg/abi/linux/socket.go @@ -422,6 +422,15 @@ type ControlMessageRights []int32 // ControlMessageRights. const SizeOfControlMessageRight = 4 +// SizeOfControlMessageInq is the size of a TCP_INQ control message. +const SizeOfControlMessageInq = 4 + +// SizeOfControlMessageTOS is the size of an IP_TOS control message. +const SizeOfControlMessageTOS = 1 + +// SizeOfControlMessageTClass is the size of an IPV6_TCLASS control message. +const SizeOfControlMessageTClass = 4 + // SCM_MAX_FD is the maximum number of FDs accepted in a single sendmsg call. // From net/scm.h. const SCM_MAX_FD = 253 diff --git a/pkg/sentry/fsimpl/proc/filesystems.go b/pkg/abi/linux/xattr.go index c36c4aff5..a3b6406fa 100644 --- a/pkg/sentry/fsimpl/proc/filesystems.go +++ b/pkg/abi/linux/xattr.go @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -package proc +package linux -// filesystemsData implements vfs.DynamicBytesSource for /proc/filesystems. -// -// +stateify savable -type filesystemsData struct{} +// Constants for extended attributes. +const ( + XATTR_NAME_MAX = 255 + XATTR_SIZE_MAX = 65536 + + XATTR_CREATE = 1 + XATTR_REPLACE = 2 -// TODO(b/138862512): Implement vfs.DynamicBytesSource.Generate for -// filesystemsData. We would need to retrive filesystem names from -// vfs.VirtualFilesystem. Also needs vfs replacement for -// fs.Filesystem.AllowUserList() and fs.FilesystemRequiresDev. + XATTR_USER_PREFIX = "user." + XATTR_USER_PREFIX_LEN = len(XATTR_USER_PREFIX) +) diff --git a/pkg/fspath/BUILD b/pkg/fspath/BUILD index 0c5f50397..ca540363c 100644 --- a/pkg/fspath/BUILD +++ b/pkg/fspath/BUILD @@ -14,7 +14,6 @@ go_library( "fspath.go", ], importpath = "gvisor.dev/gvisor/pkg/fspath", - deps = ["//pkg/syserror"], ) go_test( @@ -25,5 +24,4 @@ go_test( "fspath_test.go", ], embed = [":fspath"], - deps = ["//pkg/syserror"], ) diff --git a/pkg/fspath/fspath.go b/pkg/fspath/fspath.go index f68752560..9fb3fee24 100644 --- a/pkg/fspath/fspath.go +++ b/pkg/fspath/fspath.go @@ -18,19 +18,17 @@ package fspath import ( "strings" - - "gvisor.dev/gvisor/pkg/syserror" ) const pathSep = '/' -// Parse parses a pathname as described by path_resolution(7). -func Parse(pathname string) (Path, error) { +// Parse parses a pathname as described by path_resolution(7), except that +// empty pathnames will be parsed successfully to a Path for which +// Path.Absolute == Path.Dir == Path.HasComponents() == false. (This is +// necessary to support AT_EMPTY_PATH.) +func Parse(pathname string) Path { if len(pathname) == 0 { - // "... POSIX decrees that an empty pathname must not be resolved - // successfully. Linux returns ENOENT in this case." - - // path_resolution(7) - return Path{}, syserror.ENOENT + return Path{} } // Skip leading path separators. i := 0 @@ -41,7 +39,7 @@ func Parse(pathname string) (Path, error) { return Path{ Absolute: true, Dir: true, - }, nil + } } } // Skip trailing path separators. This is required by Iterator.Next. This @@ -64,7 +62,7 @@ func Parse(pathname string) (Path, error) { }, Absolute: i != 0, Dir: j != len(pathname)-1, - }, nil + } } // Path contains the information contained in a pathname string. @@ -111,6 +109,12 @@ func (p Path) String() string { return b.String() } +// HasComponents returns true if p contains a non-zero number of path +// components. +func (p Path) HasComponents() bool { + return p.Begin.Ok() +} + // An Iterator represents either a path component in a Path or a terminal // iterator indicating that the end of the path has been reached. // diff --git a/pkg/fspath/fspath_test.go b/pkg/fspath/fspath_test.go index 215b35622..d5e9a549a 100644 --- a/pkg/fspath/fspath_test.go +++ b/pkg/fspath/fspath_test.go @@ -18,15 +18,10 @@ import ( "reflect" "strings" "testing" - - "gvisor.dev/gvisor/pkg/syserror" ) func TestParseIteratorPartialPathnames(t *testing.T) { - path, err := Parse("/foo//bar///baz////") - if err != nil { - t.Fatalf("Parse failed: %v", err) - } + path := Parse("/foo//bar///baz////") // Parse strips leading slashes, and records their presence as // Path.Absolute. if !path.Absolute { @@ -71,6 +66,12 @@ func TestParse(t *testing.T) { } tests := []testCase{ { + pathname: "", + relpath: []string{}, + abs: false, + dir: false, + }, + { pathname: "/", relpath: []string{}, abs: true, @@ -113,10 +114,7 @@ func TestParse(t *testing.T) { for _, test := range tests { t.Run(test.pathname, func(t *testing.T) { - p, err := Parse(test.pathname) - if err != nil { - t.Fatalf("failed to parse pathname %q: %v", test.pathname, err) - } + p := Parse(test.pathname) t.Logf("pathname %q => path %q", test.pathname, p) if p.Absolute != test.abs { t.Errorf("path absoluteness: got %v, wanted %v", p.Absolute, test.abs) @@ -134,10 +132,3 @@ func TestParse(t *testing.T) { }) } } - -func TestParseEmptyPathname(t *testing.T) { - p, err := Parse("") - if err != syserror.ENOENT { - t.Errorf("parsing empty pathname: got (%v, %v), wanted (<unspecified>, ENOENT)", p, err) - } -} diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go index 51869c7d6..b9582c07f 100644 --- a/pkg/p9/handlers.go +++ b/pkg/p9/handlers.go @@ -257,7 +257,6 @@ func CanOpen(mode FileMode) bool { // handle implements handler.handle. func (t *Tlopen) handle(cs *connState) message { - // Lookup the FID. ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(syscall.EBADF) @@ -294,7 +293,6 @@ func (t *Tlopen) handle(cs *connState) message { return syscall.EINVAL } - // Do the open. osFile, qid, ioUnit, err = ref.file.Open(t.Flags) return err }); err != nil { @@ -311,12 +309,10 @@ func (t *Tlopen) handle(cs *connState) message { } func (t *Tlcreate) do(cs *connState, uid UID) (*Rlcreate, error) { - // Don't allow complex names. if err := checkSafeName(t.Name); err != nil { return nil, err } - // Lookup the FID. ref, ok := cs.LookupFID(t.FID) if !ok { return nil, syscall.EBADF @@ -390,12 +386,10 @@ func (t *Tsymlink) handle(cs *connState) message { } func (t *Tsymlink) do(cs *connState, uid UID) (*Rsymlink, error) { - // Don't allow complex names. if err := checkSafeName(t.Name); err != nil { return nil, err } - // Lookup the FID. ref, ok := cs.LookupFID(t.Directory) if !ok { return nil, syscall.EBADF @@ -426,19 +420,16 @@ func (t *Tsymlink) do(cs *connState, uid UID) (*Rsymlink, error) { // handle implements handler.handle. func (t *Tlink) handle(cs *connState) message { - // Don't allow complex names. if err := checkSafeName(t.Name); err != nil { return newErr(err) } - // Lookup the FID. ref, ok := cs.LookupFID(t.Directory) if !ok { return newErr(syscall.EBADF) } defer ref.DecRef() - // Lookup the other FID. refTarget, ok := cs.LookupFID(t.Target) if !ok { return newErr(syscall.EBADF) @@ -467,7 +458,6 @@ func (t *Tlink) handle(cs *connState) message { // handle implements handler.handle. func (t *Trenameat) handle(cs *connState) message { - // Don't allow complex names. if err := checkSafeName(t.OldName); err != nil { return newErr(err) } @@ -475,14 +465,12 @@ func (t *Trenameat) handle(cs *connState) message { return newErr(err) } - // Lookup the FID. ref, ok := cs.LookupFID(t.OldDirectory) if !ok { return newErr(syscall.EBADF) } defer ref.DecRef() - // Lookup the other FID. refTarget, ok := cs.LookupFID(t.NewDirectory) if !ok { return newErr(syscall.EBADF) @@ -523,12 +511,10 @@ func (t *Trenameat) handle(cs *connState) message { // handle implements handler.handle. func (t *Tunlinkat) handle(cs *connState) message { - // Don't allow complex names. if err := checkSafeName(t.Name); err != nil { return newErr(err) } - // Lookup the FID. ref, ok := cs.LookupFID(t.Directory) if !ok { return newErr(syscall.EBADF) @@ -577,19 +563,16 @@ func (t *Tunlinkat) handle(cs *connState) message { // handle implements handler.handle. func (t *Trename) handle(cs *connState) message { - // Don't allow complex names. if err := checkSafeName(t.Name); err != nil { return newErr(err) } - // Lookup the FID. ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(syscall.EBADF) } defer ref.DecRef() - // Lookup the target. refTarget, ok := cs.LookupFID(t.Directory) if !ok { return newErr(syscall.EBADF) @@ -641,7 +624,6 @@ func (t *Trename) handle(cs *connState) message { // handle implements handler.handle. func (t *Treadlink) handle(cs *connState) message { - // Lookup the FID. ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(syscall.EBADF) @@ -669,7 +651,6 @@ func (t *Treadlink) handle(cs *connState) message { // handle implements handler.handle. func (t *Tread) handle(cs *connState) message { - // Lookup the FID. ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(syscall.EBADF) @@ -708,7 +689,6 @@ func (t *Tread) handle(cs *connState) message { // handle implements handler.handle. func (t *Twrite) handle(cs *connState) message { - // Lookup the FID. ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(syscall.EBADF) @@ -747,12 +727,10 @@ func (t *Tmknod) handle(cs *connState) message { } func (t *Tmknod) do(cs *connState, uid UID) (*Rmknod, error) { - // Don't allow complex names. if err := checkSafeName(t.Name); err != nil { return nil, err } - // Lookup the FID. ref, ok := cs.LookupFID(t.Directory) if !ok { return nil, syscall.EBADF @@ -791,12 +769,10 @@ func (t *Tmkdir) handle(cs *connState) message { } func (t *Tmkdir) do(cs *connState, uid UID) (*Rmkdir, error) { - // Don't allow complex names. if err := checkSafeName(t.Name); err != nil { return nil, err } - // Lookup the FID. ref, ok := cs.LookupFID(t.Directory) if !ok { return nil, syscall.EBADF @@ -827,7 +803,6 @@ func (t *Tmkdir) do(cs *connState, uid UID) (*Rmkdir, error) { // handle implements handler.handle. func (t *Tgetattr) handle(cs *connState) message { - // Lookup the FID. ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(syscall.EBADF) @@ -856,7 +831,6 @@ func (t *Tgetattr) handle(cs *connState) message { // handle implements handler.handle. func (t *Tsetattr) handle(cs *connState) message { - // Lookup the FID. ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(syscall.EBADF) @@ -883,7 +857,6 @@ func (t *Tsetattr) handle(cs *connState) message { // handle implements handler.handle. func (t *Tallocate) handle(cs *connState) message { - // Lookup the FID. ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(syscall.EBADF) @@ -917,7 +890,6 @@ func (t *Tallocate) handle(cs *connState) message { // handle implements handler.handle. func (t *Txattrwalk) handle(cs *connState) message { - // Lookup the FID. ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(syscall.EBADF) @@ -930,7 +902,6 @@ func (t *Txattrwalk) handle(cs *connState) message { // handle implements handler.handle. func (t *Txattrcreate) handle(cs *connState) message { - // Lookup the FID. ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(syscall.EBADF) @@ -943,7 +914,6 @@ func (t *Txattrcreate) handle(cs *connState) message { // handle implements handler.handle. func (t *Treaddir) handle(cs *connState) message { - // Lookup the FID. ref, ok := cs.LookupFID(t.Directory) if !ok { return newErr(syscall.EBADF) @@ -977,7 +947,6 @@ func (t *Treaddir) handle(cs *connState) message { // handle implements handler.handle. func (t *Tfsync) handle(cs *connState) message { - // Lookup the FID. ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(syscall.EBADF) @@ -1001,7 +970,6 @@ func (t *Tfsync) handle(cs *connState) message { // handle implements handler.handle. func (t *Tstatfs) handle(cs *connState) message { - // Lookup the FID. ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(syscall.EBADF) @@ -1192,7 +1160,6 @@ func doWalk(cs *connState, ref *fidRef, names []string, getattr bool) (qids []QI // handle implements handler.handle. func (t *Twalk) handle(cs *connState) message { - // Lookup the FID. ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(syscall.EBADF) @@ -1213,7 +1180,6 @@ func (t *Twalk) handle(cs *connState) message { // handle implements handler.handle. func (t *Twalkgetattr) handle(cs *connState) message { - // Lookup the FID. ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(syscall.EBADF) @@ -1270,7 +1236,6 @@ func (t *Tumknod) handle(cs *connState) message { // handle implements handler.handle. func (t *Tlconnect) handle(cs *connState) message { - // Lookup the FID. ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(syscall.EBADF) @@ -1303,7 +1268,6 @@ func (t *Tchannel) handle(cs *connState) message { return newErr(err) } - // Lookup the given channel. ch := cs.lookupChannel(t.ID) if ch == nil { return newErr(syscall.ENOSYS) diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go index 498ca4669..81ec98a77 100644 --- a/pkg/sentry/arch/arch.go +++ b/pkg/sentry/arch/arch.go @@ -125,9 +125,9 @@ type Context interface { // SetTLS sets the current TLS pointer. Returns false if value is invalid. SetTLS(value uintptr) bool - // SetRSEQInterruptedIP sets the register that contains the old IP when a - // restartable sequence is interrupted. - SetRSEQInterruptedIP(value uintptr) + // SetOldRSeqInterruptedIP sets the register that contains the old IP + // when an "old rseq" restartable sequence is interrupted. + SetOldRSeqInterruptedIP(value uintptr) // StateData returns a pointer to underlying architecture state. StateData() *State diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index 9e7db8b30..2aa08b1a9 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -174,8 +174,8 @@ func (c *context64) SetTLS(value uintptr) bool { return true } -// SetRSEQInterruptedIP implements Context.SetRSEQInterruptedIP. -func (c *context64) SetRSEQInterruptedIP(value uintptr) { +// SetOldRSeqInterruptedIP implements Context.SetOldRSeqInterruptedIP. +func (c *context64) SetOldRSeqInterruptedIP(value uintptr) { c.Regs.R10 = uint64(value) } @@ -305,7 +305,7 @@ func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) { buf := binary.Marshal(nil, usermem.ByteOrder, c.ptraceGetRegs()) return c.Native(uintptr(usermem.ByteOrder.Uint64(buf[addr:]))), nil } - // TODO(b/34088053): debug registers + // Note: x86 debug registers are missing. return c.Native(0), nil } @@ -320,6 +320,6 @@ func (c *context64) PtracePokeUser(addr, data uintptr) error { _, err := c.PtraceSetRegs(bytes.NewBuffer(buf)) return err } - // TODO(b/34088053): debug registers + // Note: x86 debug registers are missing. return nil } diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go index 1f78d54a2..e1f2fea60 100644 --- a/pkg/sentry/control/pprof.go +++ b/pkg/sentry/control/pprof.go @@ -22,6 +22,7 @@ import ( "sync" "gvisor.dev/gvisor/pkg/fd" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/urpc" ) @@ -56,6 +57,9 @@ type Profile struct { // traceFile is the current execution trace output file. traceFile *fd.FD + + // Kernel is the kernel under profile. + Kernel *kernel.Kernel } // StartCPUProfile is an RPC stub which starts recording the CPU profile in a @@ -147,6 +151,9 @@ func (p *Profile) StartTrace(o *ProfileOpts, _ *struct{}) error { return err } + // Ensure all trace contexts are registered. + p.Kernel.RebuildTraceContexts() + p.traceFile = output return nil } @@ -158,9 +165,15 @@ func (p *Profile) StopTrace(_, _ *struct{}) error { defer p.mu.Unlock() if p.traceFile == nil { - return errors.New("Execution tracing not start") + return errors.New("Execution tracing not started") } + // Similarly to the case above, if tasks have not ended traces, we will + // lose information. Thus we need to rebuild the tasks in order to have + // complete information. This will not lose information if multiple + // traces are overlapping. + p.Kernel.RebuildTraceContexts() + trace.Stop() p.traceFile.Close() p.traceFile = nil diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index c35faeb4c..ced51c66c 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -268,14 +268,17 @@ func (proc *Proc) Ps(args *PsArgs, out *string) error { } // Process contains information about a single process in a Sandbox. -// TODO(b/117881927): Implement TTY field. type Process struct { UID auth.KUID `json:"uid"` PID kernel.ThreadID `json:"pid"` // Parent PID - PPID kernel.ThreadID `json:"ppid"` + PPID kernel.ThreadID `json:"ppid"` + Threads []kernel.ThreadID `json:"threads"` // Processor utilization C int32 `json:"c"` + // TTY name of the process. Will be of the form "pts/N" if there is a + // TTY, or "?" if there is not. + TTY string `json:"tty"` // Start time STime string `json:"stime"` // CPU time @@ -285,18 +288,19 @@ type Process struct { } // ProcessListToTable prints a table with the following format: -// UID PID PPID C STIME TIME CMD -// 0 1 0 0 14:04 505262ns tail +// UID PID PPID C TTY STIME TIME CMD +// 0 1 0 0 pty/4 14:04 505262ns tail func ProcessListToTable(pl []*Process) string { var buf bytes.Buffer tw := tabwriter.NewWriter(&buf, 10, 1, 3, ' ', 0) - fmt.Fprint(tw, "UID\tPID\tPPID\tC\tSTIME\tTIME\tCMD") + fmt.Fprint(tw, "UID\tPID\tPPID\tC\tTTY\tSTIME\tTIME\tCMD") for _, d := range pl { - fmt.Fprintf(tw, "\n%d\t%d\t%d\t%d\t%s\t%s\t%s", + fmt.Fprintf(tw, "\n%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s", d.UID, d.PID, d.PPID, d.C, + d.TTY, d.STime, d.Time, d.Cmd) @@ -307,7 +311,7 @@ func ProcessListToTable(pl []*Process) string { // ProcessListToJSON will return the JSON representation of ps. func ProcessListToJSON(pl []*Process) (string, error) { - b, err := json.Marshal(pl) + b, err := json.MarshalIndent(pl, "", " ") if err != nil { return "", fmt.Errorf("couldn't marshal process list %v: %v", pl, err) } @@ -334,7 +338,9 @@ func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error { ts := k.TaskSet() now := k.RealtimeClock().Now() for _, tg := range ts.Root.ThreadGroups() { - pid := tg.PIDNamespace().IDOfThreadGroup(tg) + pidns := tg.PIDNamespace() + pid := pidns.IDOfThreadGroup(tg) + // If tg has already been reaped ignore it. if pid == 0 { continue @@ -345,16 +351,19 @@ func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error { ppid := kernel.ThreadID(0) if p := tg.Leader().Parent(); p != nil { - ppid = p.PIDNamespace().IDOfThreadGroup(p.ThreadGroup()) + ppid = pidns.IDOfThreadGroup(p.ThreadGroup()) } + threads := tg.MemberIDs(pidns) *out = append(*out, &Process{ - UID: tg.Leader().Credentials().EffectiveKUID, - PID: pid, - PPID: ppid, - STime: formatStartTime(now, tg.Leader().StartTime()), - C: percentCPU(tg.CPUStats(), tg.Leader().StartTime(), now), - Time: tg.CPUStats().SysTime.String(), - Cmd: tg.Leader().Name(), + UID: tg.Leader().Credentials().EffectiveKUID, + PID: pid, + PPID: ppid, + Threads: threads, + STime: formatStartTime(now, tg.Leader().StartTime()), + C: percentCPU(tg.CPUStats(), tg.Leader().StartTime(), now), + Time: tg.CPUStats().SysTime.String(), + Cmd: tg.Leader().Name(), + TTY: ttyName(tg.TTY()), }) } sort.Slice(*out, func(i, j int) bool { return (*out)[i].PID < (*out)[j].PID }) @@ -395,3 +404,10 @@ func percentCPU(stats usage.CPUStats, startTime, now ktime.Time) int32 { } return int32(percentCPU) } + +func ttyName(tty *kernel.TTY) string { + if tty == nil { + return "?" + } + return fmt.Sprintf("pts/%d", tty.Index) +} diff --git a/pkg/sentry/control/proc_test.go b/pkg/sentry/control/proc_test.go index d8ada2694..0a88459b2 100644 --- a/pkg/sentry/control/proc_test.go +++ b/pkg/sentry/control/proc_test.go @@ -34,7 +34,7 @@ func TestProcessListTable(t *testing.T) { }{ { pl: []*Process{}, - expected: "UID PID PPID C STIME TIME CMD", + expected: "UID PID PPID C TTY STIME TIME CMD", }, { pl: []*Process{ @@ -43,6 +43,7 @@ func TestProcessListTable(t *testing.T) { PID: 0, PPID: 0, C: 0, + TTY: "?", STime: "0", Time: "0", Cmd: "zero", @@ -52,14 +53,15 @@ func TestProcessListTable(t *testing.T) { PID: 1, PPID: 1, C: 1, + TTY: "pts/4", STime: "1", Time: "1", Cmd: "one", }, }, - expected: `UID PID PPID C STIME TIME CMD -0 0 0 0 0 0 zero -1 1 1 1 1 1 one`, + expected: `UID PID PPID C TTY STIME TIME CMD +0 0 0 0 ? 0 0 zero +1 1 1 1 pts/4 1 1 one`, }, } diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index c0a6e884b..a2f966cb6 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -555,6 +555,10 @@ type lockedWriter struct { // // This applies only to Write, not WriteAt. Offset int64 + + // Err contains the first error encountered while copying. This is + // useful to determine whether Writer or Reader failed during io.Copy. + Err error } // Write implements io.Writer.Write. @@ -590,5 +594,8 @@ func (w *lockedWriter) WriteAt(buf []byte, offset int64) (int, error) { break } } + if w.Err == nil { + w.Err = err + } return written, err } diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index b2e8d9c77..9ca695a95 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -53,7 +53,7 @@ go_template_instance( "Key": "uint64", "Range": "memmap.MappableRange", "Value": "uint64", - "Functions": "fileRangeSetFunctions", + "Functions": "FileRangeSetFunctions", }, ) diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go index 0a5466b0a..f52d712e3 100644 --- a/pkg/sentry/fs/fsutil/file_range_set.go +++ b/pkg/sentry/fs/fsutil/file_range_set.go @@ -34,25 +34,25 @@ import ( // // type FileRangeSet <generated by go_generics> -// fileRangeSetFunctions implements segment.Functions for FileRangeSet. -type fileRangeSetFunctions struct{} +// FileRangeSetFunctions implements segment.Functions for FileRangeSet. +type FileRangeSetFunctions struct{} // MinKey implements segment.Functions.MinKey. -func (fileRangeSetFunctions) MinKey() uint64 { +func (FileRangeSetFunctions) MinKey() uint64 { return 0 } // MaxKey implements segment.Functions.MaxKey. -func (fileRangeSetFunctions) MaxKey() uint64 { +func (FileRangeSetFunctions) MaxKey() uint64 { return math.MaxUint64 } // ClearValue implements segment.Functions.ClearValue. -func (fileRangeSetFunctions) ClearValue(_ *uint64) { +func (FileRangeSetFunctions) ClearValue(_ *uint64) { } // Merge implements segment.Functions.Merge. -func (fileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _ memmap.MappableRange, frstart2 uint64) (uint64, bool) { +func (FileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _ memmap.MappableRange, frstart2 uint64) (uint64, bool) { if frstart1+mr1.Length() != frstart2 { return 0, false } @@ -60,7 +60,7 @@ func (fileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _ } // Split implements segment.Functions.Split. -func (fileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, split uint64) (uint64, uint64) { +func (FileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, split uint64) (uint64, uint64) { return frstart, frstart + (split - mr.Start) } diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index 8c17603f8..c09f3b71c 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -234,6 +234,8 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, if err != nil { return nil, err } + // We're not going to use newFile after return. + defer newFile.close(ctx) // Stabilize the endpoint map while creation is in progress. unlock := i.session().endpoints.lock() @@ -254,7 +256,6 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, // Get the attributes of the file to create inode key. qid, mask, attr, err := getattr(ctx, newFile) if err != nil { - newFile.close(ctx) return nil, err } @@ -270,7 +271,6 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, // cloned and re-opened multiple times after creation. _, unopened, err := i.fileState.file.walk(ctx, []string{name}) if err != nil { - newFile.close(ctx) return nil, err } diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 0da608548..4e358a46a 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -143,9 +143,9 @@ type session struct { // socket files. This allows unix domain sockets to be used with paths that // belong to a gofer. // - // TODO(b/77154739): there are few possible races with someone stat'ing the - // file and another deleting it concurrently, where the file will not be - // reported as socket file. + // TODO(gvisor.dev/issue/1200): there are few possible races with someone + // stat'ing the file and another deleting it concurrently, where the file + // will not be reported as socket file. endpoints *endpointMaps `state:"wait"` } diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index 2d43dff1d..91e2fde2f 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -270,6 +270,14 @@ func (i *Inode) Getxattr(name string) (string, error) { return i.InodeOperations.Getxattr(i, name) } +// Setxattr calls i.InodeOperations.Setxattr with i as the Inode. +func (i *Inode) Setxattr(name, value string) error { + if i.overlay != nil { + return overlaySetxattr(i.overlay, name, value) + } + return i.InodeOperations.Setxattr(i, name, value) +} + // Listxattr calls i.InodeOperations.Listxattr with i as the Inode. func (i *Inode) Listxattr() (map[string]struct{}, error) { if i.overlay != nil { diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index a09147080..13d11e001 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -436,7 +436,7 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena } func overlayBind(ctx context.Context, o *overlayEntry, parent *Dirent, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) { - if err := copyUp(ctx, parent); err != nil { + if err := copyUpLockedForRename(ctx, parent); err != nil { return nil, err } @@ -552,6 +552,11 @@ func overlayGetxattr(o *overlayEntry, name string) (string, error) { return s, err } +// TODO(b/146028302): Support setxattr for overlayfs. +func overlaySetxattr(o *overlayEntry, name, value string) error { + return syserror.EOPNOTSUPP +} + func overlayListxattr(o *overlayEntry) (map[string]struct{}, error) { o.copyMu.RLock() defer o.copyMu.RUnlock() diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index f3b63dfc2..bd93f83fa 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -64,7 +64,7 @@ var _ fs.InodeOperations = (*tcpMemInode)(nil) func newTCPMemInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack, dir tcpMemDir) *fs.Inode { tm := &tcpMemInode{ - SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC), s: s, dir: dir, } @@ -77,6 +77,11 @@ func newTCPMemInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack, dir return fs.NewInode(ctx, tm, msrc, sattr) } +// Truncate implements fs.InodeOperations.Truncate. +func (tcpMemInode) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + // GetFile implements fs.InodeOperations.GetFile. func (m *tcpMemInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { flags.Pread = true @@ -168,14 +173,15 @@ func writeSize(dirType tcpMemDir, s inet.Stack, size inet.TCPBufferSize) error { // +stateify savable type tcpSack struct { + fsutil.SimpleFileInode + stack inet.Stack `state:"wait"` enabled *bool - fsutil.SimpleFileInode } func newTCPSackInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { ts := &tcpSack{ - SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC), stack: s, } sattr := fs.StableAttr{ @@ -187,6 +193,11 @@ func newTCPSackInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *f return fs.NewInode(ctx, ts, msrc, sattr) } +// Truncate implements fs.InodeOperations.Truncate. +func (tcpSack) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + // GetFile implements fs.InodeOperations.GetFile. func (s *tcpSack) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { flags.Pread = true diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 87184ec67..9bf4b4527 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -67,29 +67,28 @@ type taskDir struct { var _ fs.InodeOperations = (*taskDir)(nil) // newTaskDir creates a new proc task entry. -func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, showSubtasks bool) *fs.Inode { +func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode { contents := map[string]*fs.Inode{ - "auxv": newAuxvec(t, msrc), - "cmdline": newExecArgInode(t, msrc, cmdlineExecArg), - "comm": newComm(t, msrc), - "environ": newExecArgInode(t, msrc, environExecArg), - "exe": newExe(t, msrc), - "fd": newFdDir(t, msrc), - "fdinfo": newFdInfoDir(t, msrc), - "gid_map": newGIDMap(t, msrc), - // FIXME(b/123511468): create the correct io file for threads. - "io": newIO(t, msrc), + "auxv": newAuxvec(t, msrc), + "cmdline": newExecArgInode(t, msrc, cmdlineExecArg), + "comm": newComm(t, msrc), + "environ": newExecArgInode(t, msrc, environExecArg), + "exe": newExe(t, msrc), + "fd": newFdDir(t, msrc), + "fdinfo": newFdInfoDir(t, msrc), + "gid_map": newGIDMap(t, msrc), + "io": newIO(t, msrc, isThreadGroup), "maps": newMaps(t, msrc), "mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), "mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), "ns": newNamespaceDir(t, msrc), "smaps": newSmaps(t, msrc), - "stat": newTaskStat(t, msrc, showSubtasks, p.pidns), + "stat": newTaskStat(t, msrc, isThreadGroup, p.pidns), "statm": newStatm(t, msrc), "status": newStatus(t, msrc, p.pidns), "uid_map": newUIDMap(t, msrc), } - if showSubtasks { + if isThreadGroup { contents["task"] = p.newSubtasks(t, msrc) } if len(p.cgroupControllers) > 0 { @@ -605,6 +604,10 @@ func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ( fmt.Fprintf(&buf, "CapEff:\t%016x\n", creds.EffectiveCaps) fmt.Fprintf(&buf, "CapBnd:\t%016x\n", creds.BoundingCaps) fmt.Fprintf(&buf, "Seccomp:\t%d\n", s.t.SeccompMode()) + // We unconditionally report a single NUMA node. See + // pkg/sentry/syscalls/linux/sys_mempolicy.go. + fmt.Fprintf(&buf, "Mems_allowed:\t1\n") + fmt.Fprintf(&buf, "Mems_allowed_list:\t0\n") return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*statusData)(nil)}}, 0 } @@ -619,8 +622,11 @@ type ioData struct { ioUsage } -func newIO(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - return newProcInode(t, seqfile.NewSeqFile(t, &ioData{t.ThreadGroup()}), msrc, fs.SpecialFile, t) +func newIO(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode { + if isThreadGroup { + return newProcInode(t, seqfile.NewSeqFile(t, &ioData{t.ThreadGroup()}), msrc, fs.SpecialFile, t) + } + return newProcInode(t, seqfile.NewSeqFile(t, &ioData{t}), msrc, fs.SpecialFile, t) } // NeedsUpdate returns whether the generation is old or not. @@ -639,7 +645,7 @@ func (i *ioData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se io.Accumulate(i.IOUsage()) var buf bytes.Buffer - fmt.Fprintf(&buf, "char: %d\n", io.CharsRead) + fmt.Fprintf(&buf, "rchar: %d\n", io.CharsRead) fmt.Fprintf(&buf, "wchar: %d\n", io.CharsWritten) fmt.Fprintf(&buf, "syscr: %d\n", io.ReadSyscalls) fmt.Fprintf(&buf, "syscw: %d\n", io.WriteSyscalls) diff --git a/pkg/sentry/fs/splice.go b/pkg/sentry/fs/splice.go index 311798811..389c330a0 100644 --- a/pkg/sentry/fs/splice.go +++ b/pkg/sentry/fs/splice.go @@ -167,6 +167,11 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64, if !srcPipe && !opts.SrcOffset { atomic.StoreInt64(&src.offset, src.offset+n) } + + // Don't report any errors if we have some progress without data loss. + if w.Err == nil { + err = nil + } } // Drop locks. diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index bc56be696..6b07f6bf2 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -32,7 +32,6 @@ import ( // +stateify savable type masterInodeOperations struct { fsutil.SimpleFileInode - fsutil.InodeNoopTruncate // d is the containing dir. d *dirInodeOperations @@ -77,6 +76,11 @@ func newMasterInode(ctx context.Context, d *dirInodeOperations, owner fs.FileOwn func (mi *masterInodeOperations) Release(ctx context.Context) { } +// Truncate implements fs.InodeOperations.Truncate. +func (*masterInodeOperations) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + // GetFile implements fs.InodeOperations.GetFile. // // It allocates a new terminal. diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index 4cbea0367..2a51e6bab 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -31,7 +31,6 @@ import ( // +stateify savable type slaveInodeOperations struct { fsutil.SimpleFileInode - fsutil.InodeNoopTruncate // d is the containing dir. d *dirInodeOperations @@ -73,6 +72,11 @@ func (si *slaveInodeOperations) Release(ctx context.Context) { si.t.DecRef() } +// Truncate implements fs.InodeOperations.Truncate. +func (slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + // GetFile implements fs.InodeOperations.GetFile. // // This may race with destruction of the terminal. If the terminal is gone, it diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go index ff8138820..917f90cc0 100644 --- a/pkg/sentry/fs/tty/terminal.go +++ b/pkg/sentry/fs/tty/terminal.go @@ -53,8 +53,8 @@ func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal d: d, n: n, ld: newLineDiscipline(termios), - masterKTTY: &kernel.TTY{}, - slaveKTTY: &kernel.TTY{}, + masterKTTY: &kernel.TTY{Index: n}, + slaveKTTY: &kernel.TTY{Index: n}, } t.EnableLeakCheck("tty.Terminal") return &t diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index 7ccff8b0d..bc90330bc 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -38,6 +38,7 @@ go_library( "//pkg/abi/linux", "//pkg/binary", "//pkg/fd", + "//pkg/fspath", "//pkg/log", "//pkg/sentry/arch", "//pkg/sentry/context", @@ -73,6 +74,7 @@ go_test( deps = [ "//pkg/abi/linux", "//pkg/binary", + "//pkg/fspath", "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/fsimpl/ext/disklayout", diff --git a/pkg/sentry/fsimpl/ext/benchmark/BUILD b/pkg/sentry/fsimpl/ext/benchmark/BUILD index bfc46dfa6..4fc8296ef 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/BUILD +++ b/pkg/sentry/fsimpl/ext/benchmark/BUILD @@ -7,6 +7,7 @@ go_test( size = "small", srcs = ["benchmark_test.go"], deps = [ + "//pkg/fspath", "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/fsimpl/ext", diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go index 94cd74095..a56b03711 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go +++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go @@ -24,6 +24,7 @@ import ( "strings" "testing" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/context/contexttest" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext" @@ -49,7 +50,9 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys // Create VFS. vfsObj := vfs.New() - vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}) + vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}) if err != nil { f.Close() @@ -81,7 +84,11 @@ func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vf ctx := contexttest.Context(b) creds := auth.CredentialsFromContext(ctx) - if err := vfsfs.NewMount(ctx, creds, imagePath, pop, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}); err != nil { + if err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + InternalData: int(f.Fd()), + }, + }); err != nil { b.Fatalf("failed to mount tmpfs submount: %v", err) } return func() { @@ -117,7 +124,7 @@ func BenchmarkVFS2Ext4fsStat(b *testing.B) { stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{ Root: *root, Start: *root, - Pathname: filePath, + Path: fspath.Parse(filePath), FollowFinalSymlink: true, }, &vfs.StatOptions{}) if err != nil { @@ -146,9 +153,9 @@ func BenchmarkVFS2ExtfsMountStat(b *testing.B) { creds := auth.CredentialsFromContext(ctx) mountPointName := "/1/" pop := vfs.PathOperation{ - Root: *root, - Start: *root, - Pathname: mountPointName, + Root: *root, + Start: *root, + Path: fspath.Parse(mountPointName), } // Save the mount point for later use. @@ -177,7 +184,7 @@ func BenchmarkVFS2ExtfsMountStat(b *testing.B) { stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{ Root: *root, Start: *root, - Pathname: filePath, + Path: fspath.Parse(filePath), FollowFinalSymlink: true, }, &vfs.StatOptions{}) if err != nil { diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go index 307e4d68c..6c14a1e2d 100644 --- a/pkg/sentry/fsimpl/ext/ext_test.go +++ b/pkg/sentry/fsimpl/ext/ext_test.go @@ -25,6 +25,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/context/contexttest" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" @@ -65,7 +66,9 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys // Create VFS. vfsObj := vfs.New() - vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}) + vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}) if err != nil { f.Close() @@ -140,62 +143,61 @@ func TestSeek(t *testing.T) { fd, err := vfsfs.OpenAt( ctx, auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path}, + &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)}, &vfs.OpenOptions{}, ) if err != nil { t.Fatalf("vfsfs.OpenAt failed: %v", err) } - if n, err := fd.Impl().Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil { + if n, err := fd.Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil { t.Errorf("expected seek position 0, got %d and error %v", n, err) } - stat, err := fd.Impl().Stat(ctx, vfs.StatOptions{}) + stat, err := fd.Stat(ctx, vfs.StatOptions{}) if err != nil { t.Errorf("fd.stat failed for file %s in image %s: %v", test.path, test.image, err) } // We should be able to seek beyond the end of file. size := int64(stat.Size) - if n, err := fd.Impl().Seek(ctx, size, linux.SEEK_SET); n != size || err != nil { + if n, err := fd.Seek(ctx, size, linux.SEEK_SET); n != size || err != nil { t.Errorf("expected seek position %d, got %d and error %v", size, n, err) } // EINVAL should be returned if the resulting offset is negative. - if _, err := fd.Impl().Seek(ctx, -1, linux.SEEK_SET); err != syserror.EINVAL { + if _, err := fd.Seek(ctx, -1, linux.SEEK_SET); err != syserror.EINVAL { t.Errorf("expected error EINVAL but got %v", err) } - if n, err := fd.Impl().Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil { + if n, err := fd.Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil { t.Errorf("expected seek position %d, got %d and error %v", size+3, n, err) } // Make sure negative offsets work with SEEK_CUR. - if n, err := fd.Impl().Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil { + if n, err := fd.Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil { t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err) } // EINVAL should be returned if the resulting offset is negative. - if _, err := fd.Impl().Seek(ctx, -(size + 2), linux.SEEK_CUR); err != syserror.EINVAL { + if _, err := fd.Seek(ctx, -(size + 2), linux.SEEK_CUR); err != syserror.EINVAL { t.Errorf("expected error EINVAL but got %v", err) } // Make sure SEEK_END works with regular files. - switch fd.Impl().(type) { - case *regularFileFD: + if _, ok := fd.Impl().(*regularFileFD); ok { // Seek back to 0. - if n, err := fd.Impl().Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil { + if n, err := fd.Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil { t.Errorf("expected seek position %d, got %d and error %v", 0, n, err) } // Seek forward beyond EOF. - if n, err := fd.Impl().Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil { + if n, err := fd.Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil { t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err) } // EINVAL should be returned if the resulting offset is negative. - if _, err := fd.Impl().Seek(ctx, -(size + 1), linux.SEEK_END); err != syserror.EINVAL { + if _, err := fd.Seek(ctx, -(size + 1), linux.SEEK_END); err != syserror.EINVAL { t.Errorf("expected error EINVAL but got %v", err) } } @@ -360,7 +362,7 @@ func TestStatAt(t *testing.T) { got, err := vfsfs.StatAt(ctx, auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path}, + &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)}, &vfs.StatOptions{}, ) if err != nil { @@ -430,7 +432,7 @@ func TestRead(t *testing.T) { fd, err := vfsfs.OpenAt( ctx, auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.absPath}, + &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.absPath)}, &vfs.OpenOptions{}, ) if err != nil { @@ -456,7 +458,7 @@ func TestRead(t *testing.T) { want := make([]byte, 1) for { n, err := f.Read(want) - fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}) + fd.Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}) if diff := cmp.Diff(got, want); diff != "" { t.Errorf("file data mismatch (-want +got):\n%s", diff) @@ -464,7 +466,7 @@ func TestRead(t *testing.T) { // Make sure there is no more file data left after getting EOF. if n == 0 || err == io.EOF { - if n, _ := fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 { + if n, _ := fd.Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 { t.Errorf("extra unexpected file data in file %s in image %s", test.absPath, test.image) } @@ -566,7 +568,7 @@ func TestIterDirents(t *testing.T) { fd, err := vfsfs.OpenAt( ctx, auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path}, + &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)}, &vfs.OpenOptions{}, ) if err != nil { @@ -574,7 +576,7 @@ func TestIterDirents(t *testing.T) { } cb := &iterDirentsCb{} - if err = fd.Impl().IterDirents(ctx, cb); err != nil { + if err = fd.IterDirents(ctx, cb); err != nil { t.Fatalf("dir fd.IterDirents() failed: %v", err) } diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go index 5eca2b83f..841274daf 100644 --- a/pkg/sentry/fsimpl/ext/file_description.go +++ b/pkg/sentry/fsimpl/ext/file_description.go @@ -26,13 +26,6 @@ import ( type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl - - // flags is the same as vfs.OpenOptions.Flags which are passed to - // vfs.FilesystemImpl.OpenAt. - // TODO(b/134676337): syscalls like read(2), write(2), fchmod(2), fchown(2), - // fgetxattr(2), ioctl(2), mmap(2) should fail with EBADF if O_PATH is set. - // Only close(2), fstat(2), fstatfs(2) should work. - flags uint32 } func (fd *fileDescription) filesystem() *filesystem { @@ -43,18 +36,6 @@ func (fd *fileDescription) inode() *inode { return fd.vfsfd.Dentry().Impl().(*dentry).inode } -// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. -func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) { - return fd.flags, nil -} - -// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags. -func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error { - // None of the flags settable by fcntl(F_SETFL) are supported, so this is a - // no-op. - return nil -} - // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { var stat linux.Statx diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 2d15e8aaf..616fc002a 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -20,6 +20,7 @@ import ( "sync" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -274,6 +275,16 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op return vfsd, nil } +// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. +func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { + vfsd, inode, err := fs.walk(rp, true) + if err != nil { + return nil, err + } + inode.incRef() + return vfsd, nil +} + // OpenAt implements vfs.FilesystemImpl.OpenAt. func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { vfsd, inode, err := fs.walk(rp, false) @@ -377,7 +388,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v } // RenameAt implements vfs.FilesystemImpl.RenameAt. -func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error { +func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { if rp.Done() { return syserror.ENOENT } @@ -441,3 +452,46 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return syserror.EROFS } + +// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { + _, _, err := fs.walk(rp, false) + if err != nil { + return nil, err + } + return nil, syserror.ENOTSUP +} + +// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { + _, _, err := fs.walk(rp, false) + if err != nil { + return "", err + } + return "", syserror.ENOTSUP +} + +// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. +func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { + _, _, err := fs.walk(rp, false) + if err != nil { + return err + } + return syserror.ENOTSUP +} + +// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. +func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + _, _, err := fs.walk(rp, false) + if err != nil { + return err + } + return syserror.ENOTSUP +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + return vfs.GenericPrependPath(vfsroot, vd, b) +} diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index 24249525c..8608805bf 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -157,10 +157,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v switch in.impl.(type) { case *regularFile: var fd regularFileFD - fd.flags = flags - mnt.IncRef() - vfsd.IncRef() - fd.vfsfd.Init(&fd, mnt, vfsd) + fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil case *directory: // Can't open directories writably. This check is not necessary for a read @@ -169,10 +166,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v return nil, syserror.EISDIR } var fd directoryFD - fd.flags = flags - mnt.IncRef() - vfsd.IncRef() - fd.vfsfd.Init(&fd, mnt, vfsd) + fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil case *symlink: if flags&linux.O_PATH == 0 { @@ -180,10 +174,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v return nil, syserror.ELOOP } var fd symlinkFD - fd.flags = flags - mnt.IncRef() - vfsd.IncRef() - fd.vfsfd.Init(&fd, mnt, vfsd) + fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil default: panic(fmt.Sprintf("unknown inode type: %T", in.impl)) diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD new file mode 100644 index 000000000..39c03ee9d --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -0,0 +1,62 @@ +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "slot_list", + out = "slot_list.go", + package = "kernfs", + prefix = "slot", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*slot", + "Linker": "*slot", + }, +) + +go_library( + name = "kernfs", + srcs = [ + "dynamic_bytes_file.go", + "fd_impl_util.go", + "filesystem.go", + "inode_impl_util.go", + "kernfs.go", + "slot_list.go", + "symlink.go", + ], + importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/fspath", + "//pkg/log", + "//pkg/refs", + "//pkg/sentry/context", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/memmap", + "//pkg/sentry/usermem", + "//pkg/sentry/vfs", + "//pkg/syserror", + ], +) + +go_test( + name = "kernfs_test", + size = "small", + srcs = ["kernfs_test.go"], + deps = [ + ":kernfs", + "//pkg/abi/linux", + "//pkg/fspath", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/usermem", + "//pkg/sentry/vfs", + "//pkg/syserror", + "@com_github_google_go-cmp//cmp:go_default_library", + ], +) diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go new file mode 100644 index 000000000..606ca692d --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -0,0 +1,127 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernfs + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// DynamicBytesFile implements kernfs.Inode and represents a read-only +// file whose contents are backed by a vfs.DynamicBytesSource. +// +// Must be instantiated with NewDynamicBytesFile or initialized with Init +// before first use. +// +// +stateify savable +type DynamicBytesFile struct { + InodeAttrs + InodeNoopRefCount + InodeNotDirectory + InodeNotSymlink + + data vfs.DynamicBytesSource +} + +var _ Inode = (*DynamicBytesFile)(nil) + +// Init initializes a dynamic bytes file. +func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) { + if perm&^linux.PermissionsMask != 0 { + panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) + } + f.InodeAttrs.Init(creds, ino, linux.ModeRegular|perm) + f.data = data +} + +// Open implements Inode.Open. +func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + fd := &DynamicBytesFD{} + fd.Init(rp.Mount(), vfsd, f.data, flags) + return &fd.vfsfd, nil +} + +// SetStat implements Inode.SetStat. +func (f *DynamicBytesFile) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error { + // DynamicBytesFiles are immutable. + return syserror.EPERM +} + +// DynamicBytesFD implements vfs.FileDescriptionImpl for an FD backed by a +// DynamicBytesFile. +// +// Must be initialized with Init before first use. +// +// +stateify savable +type DynamicBytesFD struct { + vfs.FileDescriptionDefaultImpl + vfs.DynamicBytesFileDescriptionImpl + + vfsfd vfs.FileDescription + inode Inode +} + +// Init initializes a DynamicBytesFD. +func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) { + fd.inode = d.Impl().(*Dentry).inode + fd.SetDataSource(data) + fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}) +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *DynamicBytesFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.Seek(ctx, offset, whence) +} + +// Read implmenets vfs.FileDescriptionImpl.Read. +func (fd *DynamicBytesFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.Read(ctx, dst, opts) +} + +// PRead implmenets vfs.FileDescriptionImpl.PRead. +func (fd *DynamicBytesFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.PRead(ctx, dst, offset, opts) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *DynamicBytesFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return fd.FileDescriptionDefaultImpl.Write(ctx, src, opts) +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *DynamicBytesFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return fd.FileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts) +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *DynamicBytesFD) Release() {} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := fd.vfsfd.VirtualDentry().Mount().Filesystem() + return fd.inode.Stat(fs), nil +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error { + // DynamicBytesFiles are immutable. + return syserror.EPERM +} diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go new file mode 100644 index 000000000..bcf069b5f --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -0,0 +1,194 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// GenericDirectoryFD implements vfs.FileDescriptionImpl for a generic directory +// inode that uses OrderChildren to track child nodes. GenericDirectoryFD is not +// compatible with dynamic directories. +// +// Note that GenericDirectoryFD holds a lock over OrderedChildren while calling +// IterDirents callback. The IterDirents callback therefore cannot hash or +// unhash children, or recursively call IterDirents on the same underlying +// inode. +// +// Must be initialize with Init before first use. +type GenericDirectoryFD struct { + vfs.FileDescriptionDefaultImpl + vfs.DirectoryFileDescriptionDefaultImpl + + vfsfd vfs.FileDescription + children *OrderedChildren + off int64 +} + +// Init initializes a GenericDirectoryFD. +func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) { + fd.children = children + fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}) +} + +// VFSFileDescription returns a pointer to the vfs.FileDescription representing +// this object. +func (fd *GenericDirectoryFD) VFSFileDescription() *vfs.FileDescription { + return &fd.vfsfd +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *GenericDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + return fd.FileDescriptionDefaultImpl.ConfigureMMap(ctx, opts) +} + +// Read implmenets vfs.FileDescriptionImpl.Read. +func (fd *GenericDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return fd.DirectoryFileDescriptionDefaultImpl.Read(ctx, dst, opts) +} + +// PRead implmenets vfs.FileDescriptionImpl.PRead. +func (fd *GenericDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return fd.DirectoryFileDescriptionDefaultImpl.PRead(ctx, dst, offset, opts) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *GenericDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return fd.DirectoryFileDescriptionDefaultImpl.Write(ctx, src, opts) +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *GenericDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return fd.DirectoryFileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts) +} + +// Release implements vfs.FileDecriptionImpl.Release. +func (fd *GenericDirectoryFD) Release() {} + +func (fd *GenericDirectoryFD) filesystem() *vfs.Filesystem { + return fd.vfsfd.VirtualDentry().Mount().Filesystem() +} + +func (fd *GenericDirectoryFD) inode() Inode { + return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode +} + +// IterDirents implements vfs.FileDecriptionImpl.IterDirents. IterDirents holds +// o.mu when calling cb. +func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + vfsFS := fd.filesystem() + fs := vfsFS.Impl().(*Filesystem) + vfsd := fd.vfsfd.VirtualDentry().Dentry() + + fs.mu.Lock() + defer fs.mu.Unlock() + + // Handle ".". + if fd.off == 0 { + stat := fd.inode().Stat(vfsFS) + dirent := vfs.Dirent{ + Name: ".", + Type: linux.DT_DIR, + Ino: stat.Ino, + NextOff: 1, + } + if !cb.Handle(dirent) { + return nil + } + fd.off++ + } + + // Handle "..". + if fd.off == 1 { + parentInode := vfsd.ParentOrSelf().Impl().(*Dentry).inode + stat := parentInode.Stat(vfsFS) + dirent := vfs.Dirent{ + Name: "..", + Type: linux.FileMode(stat.Mode).DirentType(), + Ino: stat.Ino, + NextOff: 2, + } + if !cb.Handle(dirent) { + return nil + } + fd.off++ + } + + // Handle static children. + fd.children.mu.RLock() + defer fd.children.mu.RUnlock() + // fd.off accounts for "." and "..", but fd.children do not track + // these. + childIdx := fd.off - 2 + for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() { + inode := it.Dentry.Impl().(*Dentry).inode + stat := inode.Stat(vfsFS) + dirent := vfs.Dirent{ + Name: it.Name, + Type: linux.FileMode(stat.Mode).DirentType(), + Ino: stat.Ino, + NextOff: fd.off + 1, + } + if !cb.Handle(dirent) { + return nil + } + fd.off++ + } + + var err error + relOffset := fd.off - int64(len(fd.children.set)) - 2 + fd.off, err = fd.inode().IterDirents(ctx, cb, fd.off, relOffset) + return err +} + +// Seek implements vfs.FileDecriptionImpl.Seek. +func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fs := fd.filesystem().Impl().(*Filesystem) + fs.mu.Lock() + defer fs.mu.Unlock() + + switch whence { + case linux.SEEK_SET: + // Use offset as given. + case linux.SEEK_CUR: + offset += fd.off + default: + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + fd.off = offset + return offset, nil +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := fd.filesystem() + inode := fd.inode() + return inode.Stat(fs), nil +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + fs := fd.filesystem() + inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode + return inode.SetStat(fs, opts) +} diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go new file mode 100644 index 000000000..79759e0fc --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -0,0 +1,793 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This file implements vfs.FilesystemImpl for kernfs. + +package kernfs + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// stepExistingLocked resolves rp.Component() in parent directory vfsd. +// +// stepExistingLocked is loosely analogous to fs/namei.c:walk_component(). +// +// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done(). +// +// Postcondition: Caller must call fs.processDeferredDecRefs*. +func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) (*vfs.Dentry, error) { + d := vfsd.Impl().(*Dentry) + if !d.isDir() { + return nil, syserror.ENOTDIR + } + // Directory searchable? + if err := d.inode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } +afterSymlink: + name := rp.Component() + // Revalidation must be skipped if name is "." or ".."; d or its parent + // respectively can't be expected to transition from invalidated back to + // valid, so detecting invalidation and retrying would loop forever. This + // is consistent with Linux: fs/namei.c:walk_component() => lookup_fast() + // calls d_revalidate(), but walk_component() => handle_dots() does not. + if name == "." { + rp.Advance() + return vfsd, nil + } + if name == ".." { + nextVFSD, err := rp.ResolveParent(vfsd) + if err != nil { + return nil, err + } + rp.Advance() + return nextVFSD, nil + } + d.dirMu.Lock() + nextVFSD, err := rp.ResolveChild(vfsd, name) + if err != nil { + d.dirMu.Unlock() + return nil, err + } + next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, nextVFSD) + d.dirMu.Unlock() + if err != nil { + return nil, err + } + // Resolve any symlink at current path component. + if rp.ShouldFollowSymlink() && next.isSymlink() { + // TODO: VFS2 needs something extra for /proc/[pid]/fd/ "magic symlinks". + target, err := next.inode.Readlink(ctx) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + goto afterSymlink + + } + rp.Advance() + return &next.vfsd, nil +} + +// revalidateChildLocked must be called after a call to parent.vfsd.Child(name) +// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be +// nil) to verify that the returned child (or lack thereof) is correct. +// +// Preconditions: Filesystem.mu must be locked for at least reading. +// parent.dirMu must be locked. parent.isDir(). name is not "." or "..". +// +// Postconditions: Caller must call fs.processDeferredDecRefs*. +func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, childVFSD *vfs.Dentry) (*Dentry, error) { + if childVFSD != nil { + // Cached dentry exists, revalidate. + child := childVFSD.Impl().(*Dentry) + if !child.inode.Valid(ctx) { + vfsObj.ForceDeleteDentry(childVFSD) + fs.deferDecRef(childVFSD) // Reference from Lookup. + childVFSD = nil + } + } + if childVFSD == nil { + // Dentry isn't cached; it either doesn't exist or failed + // revalidation. Attempt to resolve it via Lookup. + // + // FIXME(b/144498111): Inode.Lookup() should return *(kernfs.)Dentry, + // not *vfs.Dentry, since (kernfs.)Filesystem assumes that all dentries + // in the filesystem are (kernfs.)Dentry and performs vfs.DentryImpl + // casts accordingly. + var err error + childVFSD, err = parent.inode.Lookup(ctx, name) + if err != nil { + return nil, err + } + // Reference on childVFSD dropped by a corresponding Valid. + parent.insertChildLocked(name, childVFSD) + } + return childVFSD.Impl().(*Dentry), nil +} + +// walkExistingLocked resolves rp to an existing file. +// +// walkExistingLocked is loosely analogous to Linux's +// fs/namei.c:path_lookupat(). +// +// Preconditions: Filesystem.mu must be locked for at least reading. +// +// Postconditions: Caller must call fs.processDeferredDecRefs*. +func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) { + vfsd := rp.Start() + for !rp.Done() { + var err error + vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd) + if err != nil { + return nil, nil, err + } + } + d := vfsd.Impl().(*Dentry) + if rp.MustBeDir() && !d.isDir() { + return nil, nil, syserror.ENOTDIR + } + return vfsd, d.inode, nil +} + +// walkParentDirLocked resolves all but the last path component of rp to an +// existing directory. It does not check that the returned directory is +// searchable by the provider of rp. +// +// walkParentDirLocked is loosely analogous to Linux's +// fs/namei.c:path_parentat(). +// +// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done(). +// +// Postconditions: Caller must call fs.processDeferredDecRefs*. +func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) { + vfsd := rp.Start() + for !rp.Final() { + var err error + vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd) + if err != nil { + return nil, nil, err + } + } + d := vfsd.Impl().(*Dentry) + if !d.isDir() { + return nil, nil, syserror.ENOTDIR + } + return vfsd, d.inode, nil +} + +// checkCreateLocked checks that a file named rp.Component() may be created in +// directory parentVFSD, then returns rp.Component(). +// +// Preconditions: Filesystem.mu must be locked for at least reading. parentInode +// == parentVFSD.Impl().(*Dentry).Inode. isDir(parentInode) == true. +func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) { + if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + return "", err + } + pc := rp.Component() + if pc == "." || pc == ".." { + return "", syserror.EEXIST + } + childVFSD, err := rp.ResolveChild(parentVFSD, pc) + if err != nil { + return "", err + } + if childVFSD != nil { + return "", syserror.EEXIST + } + if parentVFSD.IsDisowned() { + return "", syserror.ENOENT + } + return pc, nil +} + +// checkDeleteLocked checks that the file represented by vfsd may be deleted. +// +// Preconditions: Filesystem.mu must be locked for at least reading. +func checkDeleteLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error { + parentVFSD := vfsd.Parent() + if parentVFSD == nil { + return syserror.EBUSY + } + if parentVFSD.IsDisowned() { + return syserror.ENOENT + } + if err := parentVFSD.Impl().(*Dentry).inode.CheckPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + return nil +} + +// checkRenameLocked checks that a rename operation may be performed on the +// target dentry across the given set of parent directories. The target dentry +// may be nil. +// +// Precondition: isDir(dstInode) == true. +func checkRenameLocked(creds *auth.Credentials, src, dstDir *vfs.Dentry, dstInode Inode) error { + srcDir := src.Parent() + if srcDir == nil { + return syserror.EBUSY + } + if srcDir.IsDisowned() { + return syserror.ENOENT + } + if dstDir.IsDisowned() { + return syserror.ENOENT + } + // Check for creation permissions on dst dir. + if err := dstInode.CheckPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + + return nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *Filesystem) Release() { +} + +// Sync implements vfs.FilesystemImpl.Sync. +func (fs *Filesystem) Sync(ctx context.Context) error { + // All filesystem state is in-memory. + return nil +} + +// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. +func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { + fs.mu.RLock() + defer fs.processDeferredDecRefs() + defer fs.mu.RUnlock() + vfsd, inode, err := fs.walkExistingLocked(ctx, rp) + if err != nil { + return nil, err + } + + if opts.CheckSearchable { + d := vfsd.Impl().(*Dentry) + if !d.isDir() { + return nil, syserror.ENOTDIR + } + if err := inode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + } + vfsd.IncRef() // Ownership transferred to caller. + return vfsd, nil +} + +// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. +func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { + fs.mu.RLock() + defer fs.processDeferredDecRefs() + defer fs.mu.RUnlock() + vfsd, _, err := fs.walkParentDirLocked(ctx, rp) + if err != nil { + return nil, err + } + vfsd.IncRef() // Ownership transferred to caller. + return vfsd, nil +} + +// LinkAt implements vfs.FilesystemImpl.LinkAt. +func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + if err != nil { + return err + } + if rp.Mount() != vd.Mount() { + return syserror.EXDEV + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + + d := vd.Dentry().Impl().(*Dentry) + if d.isDir() { + return syserror.EPERM + } + + child, err := parentInode.NewLink(ctx, pc, d.inode) + if err != nil { + return err + } + parentVFSD.Impl().(*Dentry).InsertChild(pc, child) + return nil +} + +// MkdirAt implements vfs.FilesystemImpl.MkdirAt. +func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + child, err := parentInode.NewDir(ctx, pc, opts) + if err != nil { + return err + } + parentVFSD.Impl().(*Dentry).InsertChild(pc, child) + return nil +} + +// MknodAt implements vfs.FilesystemImpl.MknodAt. +func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + new, err := parentInode.NewNode(ctx, pc, opts) + if err != nil { + return err + } + parentVFSD.Impl().(*Dentry).InsertChild(pc, new) + return nil +} + +// OpenAt implements vfs.FilesystemImpl.OpenAt. +func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + // Filter out flags that are not supported by kernfs. O_DIRECTORY and + // O_NOFOLLOW have no effect here (they're handled by VFS by setting + // appropriate bits in rp), but are returned by + // FileDescriptionImpl.StatusFlags(). + opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW + ats := vfs.AccessTypesForOpenFlags(opts.Flags) + + // Do not create new file. + if opts.Flags&linux.O_CREAT == 0 { + fs.mu.RLock() + defer fs.processDeferredDecRefs() + defer fs.mu.RUnlock() + vfsd, inode, err := fs.walkExistingLocked(ctx, rp) + if err != nil { + return nil, err + } + if err := inode.CheckPermissions(rp.Credentials(), ats); err != nil { + return nil, err + } + return inode.Open(rp, vfsd, opts.Flags) + } + + // May create new file. + mustCreate := opts.Flags&linux.O_EXCL != 0 + vfsd := rp.Start() + inode := vfsd.Impl().(*Dentry).inode + fs.mu.Lock() + defer fs.mu.Unlock() + if rp.Done() { + if rp.MustBeDir() { + return nil, syserror.EISDIR + } + if mustCreate { + return nil, syserror.EEXIST + } + if err := inode.CheckPermissions(rp.Credentials(), ats); err != nil { + return nil, err + } + return inode.Open(rp, vfsd, opts.Flags) + } +afterTrailingSymlink: + parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return nil, err + } + // Check for search permission in the parent directory. + if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + // Reject attempts to open directories with O_CREAT. + if rp.MustBeDir() { + return nil, syserror.EISDIR + } + pc := rp.Component() + if pc == "." || pc == ".." { + return nil, syserror.EISDIR + } + // Determine whether or not we need to create a file. + childVFSD, err := rp.ResolveChild(parentVFSD, pc) + if err != nil { + return nil, err + } + if childVFSD == nil { + // Already checked for searchability above; now check for writability. + if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + return nil, err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return nil, err + } + defer rp.Mount().EndWrite() + // Create and open the child. + child, err := parentInode.NewFile(ctx, pc, opts) + if err != nil { + return nil, err + } + parentVFSD.Impl().(*Dentry).InsertChild(pc, child) + return child.Impl().(*Dentry).inode.Open(rp, child, opts.Flags) + } + // Open existing file or follow symlink. + if mustCreate { + return nil, syserror.EEXIST + } + childDentry := childVFSD.Impl().(*Dentry) + childInode := childDentry.inode + if rp.ShouldFollowSymlink() { + if childDentry.isSymlink() { + target, err := childInode.Readlink(ctx) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + // rp.Final() may no longer be true since we now need to resolve the + // symlink target. + goto afterTrailingSymlink + } + } + if err := childInode.CheckPermissions(rp.Credentials(), ats); err != nil { + return nil, err + } + return childInode.Open(rp, childVFSD, opts.Flags) +} + +// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. +func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { + fs.mu.RLock() + d, inode, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return "", err + } + if !d.Impl().(*Dentry).isSymlink() { + return "", syserror.EINVAL + } + return inode.Readlink(ctx) +} + +// RenameAt implements vfs.FilesystemImpl.RenameAt. +func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { + // Only RENAME_NOREPLACE is supported. + if opts.Flags&^linux.RENAME_NOREPLACE != 0 { + return syserror.EINVAL + } + noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0 + + fs.mu.Lock() + defer fs.mu.Lock() + + // Resolve the destination directory first to verify that it's on this + // Mount. + dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + mnt := rp.Mount() + if mnt != oldParentVD.Mount() { + return syserror.EXDEV + } + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + + srcDirVFSD := oldParentVD.Dentry() + srcDir := srcDirVFSD.Impl().(*Dentry) + srcDir.dirMu.Lock() + src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName, srcDirVFSD.Child(oldName)) + srcDir.dirMu.Unlock() + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + srcVFSD := &src.vfsd + + // Can we remove the src dentry? + if err := checkDeleteLocked(rp, srcVFSD); err != nil { + return err + } + + // Can we create the dst dentry? + var dstVFSD *vfs.Dentry + pc, err := checkCreateLocked(rp, dstDirVFSD, dstDirInode) + switch err { + case nil: + // Ok, continue with rename as replacement. + case syserror.EEXIST: + if noReplace { + // Won't overwrite existing node since RENAME_NOREPLACE was requested. + return syserror.EEXIST + } + dstVFSD, err = rp.ResolveChild(dstDirVFSD, pc) + if err != nil { + panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD)) + } + default: + return err + } + + mntns := vfs.MountNamespaceFromContext(ctx) + virtfs := rp.VirtualFilesystem() + + srcDirDentry := srcDirVFSD.Impl().(*Dentry) + dstDirDentry := dstDirVFSD.Impl().(*Dentry) + + // We can't deadlock here due to lock ordering because we're protected from + // concurrent renames by fs.mu held for writing. + srcDirDentry.dirMu.Lock() + defer srcDirDentry.dirMu.Unlock() + dstDirDentry.dirMu.Lock() + defer dstDirDentry.dirMu.Unlock() + + if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil { + return err + } + srcDirInode := srcDirDentry.inode + replaced, err := srcDirInode.Rename(ctx, srcVFSD.Name(), pc, srcVFSD, dstDirVFSD) + if err != nil { + virtfs.AbortRenameDentry(srcVFSD, dstVFSD) + return err + } + virtfs.CommitRenameReplaceDentry(srcVFSD, dstDirVFSD, pc, replaced) + return nil +} + +// RmdirAt implements vfs.FilesystemImpl.RmdirAt. +func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { + fs.mu.Lock() + defer fs.mu.Unlock() + vfsd, inode, err := fs.walkExistingLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + if err := checkDeleteLocked(rp, vfsd); err != nil { + return err + } + if !vfsd.Impl().(*Dentry).isDir() { + return syserror.ENOTDIR + } + if inode.HasChildren() { + return syserror.ENOTEMPTY + } + virtfs := rp.VirtualFilesystem() + parentDentry := vfsd.Parent().Impl().(*Dentry) + parentDentry.dirMu.Lock() + defer parentDentry.dirMu.Unlock() + if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil { + return err + } + if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil { + virtfs.AbortDeleteDentry(vfsd) + return err + } + virtfs.CommitDeleteDentry(vfsd) + return nil +} + +// SetStatAt implements vfs.FilesystemImpl.SetStatAt. +func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { + fs.mu.RLock() + _, inode, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return err + } + if opts.Stat.Mask == 0 { + return nil + } + return inode.SetStat(fs.VFSFilesystem(), opts) +} + +// StatAt implements vfs.FilesystemImpl.StatAt. +func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { + fs.mu.RLock() + _, inode, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return linux.Statx{}, err + } + return inode.Stat(fs.VFSFilesystem()), nil +} + +// StatFSAt implements vfs.FilesystemImpl.StatFSAt. +func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { + fs.mu.RLock() + _, _, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return linux.Statfs{}, err + } + // TODO: actually implement statfs + return linux.Statfs{}, syserror.ENOSYS +} + +// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. +func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + child, err := parentInode.NewSymlink(ctx, pc, target) + if err != nil { + return err + } + parentVFSD.Impl().(*Dentry).InsertChild(pc, child) + return nil +} + +// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. +func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { + fs.mu.Lock() + defer fs.mu.Unlock() + vfsd, _, err := fs.walkExistingLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + if err := checkDeleteLocked(rp, vfsd); err != nil { + return err + } + if vfsd.Impl().(*Dentry).isDir() { + return syserror.EISDIR + } + virtfs := rp.VirtualFilesystem() + parentDentry := vfsd.Parent().Impl().(*Dentry) + parentDentry.dirMu.Lock() + defer parentDentry.dirMu.Unlock() + if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil { + return err + } + if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil { + virtfs.AbortDeleteDentry(vfsd) + return err + } + virtfs.CommitDeleteDentry(vfsd) + return nil +} + +// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. +func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { + fs.mu.RLock() + _, _, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return nil, err + } + // kernfs currently does not support extended attributes. + return nil, syserror.ENOTSUP +} + +// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. +func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { + fs.mu.RLock() + _, _, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return "", err + } + // kernfs currently does not support extended attributes. + return "", syserror.ENOTSUP +} + +// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. +func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { + fs.mu.RLock() + _, _, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return err + } + // kernfs currently does not support extended attributes. + return syserror.ENOTSUP +} + +// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. +func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + fs.mu.RLock() + _, _, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return err + } + // kernfs currently does not support extended attributes. + return syserror.ENOTSUP +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + return vfs.GenericPrependPath(vfsroot, vd, b) +} diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go new file mode 100644 index 000000000..752e0f659 --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -0,0 +1,512 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernfs + +import ( + "fmt" + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// InodeNoopRefCount partially implements the Inode interface, specifically the +// inodeRefs sub interface. InodeNoopRefCount implements a simple reference +// count for inodes, performing no extra actions when references are obtained or +// released. This is suitable for simple file inodes that don't reference any +// resources. +type InodeNoopRefCount struct { +} + +// IncRef implements Inode.IncRef. +func (n *InodeNoopRefCount) IncRef() { +} + +// DecRef implements Inode.DecRef. +func (n *InodeNoopRefCount) DecRef() { +} + +// TryIncRef implements Inode.TryIncRef. +func (n *InodeNoopRefCount) TryIncRef() bool { + return true +} + +// Destroy implements Inode.Destroy. +func (n *InodeNoopRefCount) Destroy() { +} + +// InodeDirectoryNoNewChildren partially implements the Inode interface. +// InodeDirectoryNoNewChildren represents a directory inode which does not +// support creation of new children. +type InodeDirectoryNoNewChildren struct{} + +// NewFile implements Inode.NewFile. +func (*InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +// NewDir implements Inode.NewDir. +func (*InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +// NewLink implements Inode.NewLink. +func (*InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +// NewSymlink implements Inode.NewSymlink. +func (*InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +// NewNode implements Inode.NewNode. +func (*InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +// InodeNotDirectory partially implements the Inode interface, specifically the +// inodeDirectory and inodeDynamicDirectory sub interfaces. Inodes that do not +// represent directories can embed this to provide no-op implementations for +// directory-related functions. +type InodeNotDirectory struct { +} + +// HasChildren implements Inode.HasChildren. +func (*InodeNotDirectory) HasChildren() bool { + return false +} + +// NewFile implements Inode.NewFile. +func (*InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { + panic("NewFile called on non-directory inode") +} + +// NewDir implements Inode.NewDir. +func (*InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { + panic("NewDir called on non-directory inode") +} + +// NewLink implements Inode.NewLinkink. +func (*InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { + panic("NewLink called on non-directory inode") +} + +// NewSymlink implements Inode.NewSymlink. +func (*InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { + panic("NewSymlink called on non-directory inode") +} + +// NewNode implements Inode.NewNode. +func (*InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { + panic("NewNode called on non-directory inode") +} + +// Unlink implements Inode.Unlink. +func (*InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error { + panic("Unlink called on non-directory inode") +} + +// RmDir implements Inode.RmDir. +func (*InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error { + panic("RmDir called on non-directory inode") +} + +// Rename implements Inode.Rename. +func (*InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) { + panic("Rename called on non-directory inode") +} + +// Lookup implements Inode.Lookup. +func (*InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + panic("Lookup called on non-directory inode") +} + +// IterDirents implements Inode.IterDirents. +func (*InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) { + panic("IterDirents called on non-directory inode") +} + +// Valid implements Inode.Valid. +func (*InodeNotDirectory) Valid(context.Context) bool { + return true +} + +// InodeNoDynamicLookup partially implements the Inode interface, specifically +// the inodeDynamicLookup sub interface. Directory inodes that do not support +// dymanic entries (i.e. entries that are not "hashed" into the +// vfs.Dentry.children) can embed this to provide no-op implementations for +// functions related to dynamic entries. +type InodeNoDynamicLookup struct{} + +// Lookup implements Inode.Lookup. +func (*InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + return nil, syserror.ENOENT +} + +// IterDirents implements Inode.IterDirents. +func (*InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { + return offset, nil +} + +// Valid implements Inode.Valid. +func (*InodeNoDynamicLookup) Valid(ctx context.Context) bool { + return true +} + +// InodeNotSymlink partially implements the Inode interface, specifically the +// inodeSymlink sub interface. All inodes that are not symlinks may embed this +// to return the appropriate errors from symlink-related functions. +type InodeNotSymlink struct{} + +// Readlink implements Inode.Readlink. +func (*InodeNotSymlink) Readlink(context.Context) (string, error) { + return "", syserror.EINVAL +} + +// InodeAttrs partially implements the Inode interface, specifically the +// inodeMetadata sub interface. InodeAttrs provides functionality related to +// inode attributes. +// +// Must be initialized by Init prior to first use. +type InodeAttrs struct { + ino uint64 + mode uint32 + uid uint32 + gid uint32 + nlink uint32 +} + +// Init initializes this InodeAttrs. +func (a *InodeAttrs) Init(creds *auth.Credentials, ino uint64, mode linux.FileMode) { + if mode.FileType() == 0 { + panic(fmt.Sprintf("No file type specified in 'mode' for InodeAttrs.Init(): mode=0%o", mode)) + } + + nlink := uint32(1) + if mode.FileType() == linux.ModeDirectory { + nlink = 2 + } + atomic.StoreUint64(&a.ino, ino) + atomic.StoreUint32(&a.mode, uint32(mode)) + atomic.StoreUint32(&a.uid, uint32(creds.EffectiveKUID)) + atomic.StoreUint32(&a.gid, uint32(creds.EffectiveKGID)) + atomic.StoreUint32(&a.nlink, nlink) +} + +// Mode implements Inode.Mode. +func (a *InodeAttrs) Mode() linux.FileMode { + return linux.FileMode(atomic.LoadUint32(&a.mode)) +} + +// Stat partially implements Inode.Stat. Note that this function doesn't provide +// all the stat fields, and the embedder should consider extending the result +// with filesystem-specific fields. +func (a *InodeAttrs) Stat(*vfs.Filesystem) linux.Statx { + var stat linux.Statx + stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK + stat.Ino = atomic.LoadUint64(&a.ino) + stat.Mode = uint16(a.Mode()) + stat.UID = atomic.LoadUint32(&a.uid) + stat.GID = atomic.LoadUint32(&a.gid) + stat.Nlink = atomic.LoadUint32(&a.nlink) + + // TODO: Implement other stat fields like timestamps. + + return stat +} + +// SetStat implements Inode.SetStat. +func (a *InodeAttrs) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error { + stat := opts.Stat + if stat.Mask&linux.STATX_MODE != 0 { + for { + old := atomic.LoadUint32(&a.mode) + new := old | uint32(stat.Mode & ^uint16(linux.S_IFMT)) + if swapped := atomic.CompareAndSwapUint32(&a.mode, old, new); swapped { + break + } + } + } + + if stat.Mask&linux.STATX_UID != 0 { + atomic.StoreUint32(&a.uid, stat.UID) + } + if stat.Mask&linux.STATX_GID != 0 { + atomic.StoreUint32(&a.gid, stat.GID) + } + + // Note that not all fields are modifiable. For example, the file type and + // inode numbers are immutable after node creation. + + // TODO: Implement other stat fields like timestamps. + + return nil +} + +// CheckPermissions implements Inode.CheckPermissions. +func (a *InodeAttrs) CheckPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + mode := a.Mode() + return vfs.GenericCheckPermissions( + creds, + ats, + mode.FileType() == linux.ModeDirectory, + uint16(mode), + auth.KUID(atomic.LoadUint32(&a.uid)), + auth.KGID(atomic.LoadUint32(&a.gid)), + ) +} + +// IncLinks implements Inode.IncLinks. +func (a *InodeAttrs) IncLinks(n uint32) { + if atomic.AddUint32(&a.nlink, n) <= n { + panic("InodeLink.IncLinks called with no existing links") + } +} + +// DecLinks implements Inode.DecLinks. +func (a *InodeAttrs) DecLinks() { + if nlink := atomic.AddUint32(&a.nlink, ^uint32(0)); nlink == ^uint32(0) { + // Negative overflow + panic("Inode.DecLinks called at 0 links") + } +} + +type slot struct { + Name string + Dentry *vfs.Dentry + slotEntry +} + +// OrderedChildrenOptions contains initialization options for OrderedChildren. +type OrderedChildrenOptions struct { + // Writable indicates whether vfs.FilesystemImpl methods implemented by + // OrderedChildren may modify the tracked children. This applies to + // operations related to rename, unlink and rmdir. If an OrderedChildren is + // not writable, these operations all fail with EPERM. + Writable bool +} + +// OrderedChildren partially implements the Inode interface. OrderedChildren can +// be embedded in directory inodes to keep track of the children in the +// directory, and can then be used to implement a generic directory FD -- see +// GenericDirectoryFD. OrderedChildren is not compatible with dynamic +// directories. +// +// Must be initialize with Init before first use. +type OrderedChildren struct { + refs.AtomicRefCount + + // Can children be modified by user syscalls? It set to false, interface + // methods that would modify the children return EPERM. Immutable. + writable bool + + mu sync.RWMutex + order slotList + set map[string]*slot +} + +// Init initializes an OrderedChildren. +func (o *OrderedChildren) Init(opts OrderedChildrenOptions) { + o.writable = opts.Writable + o.set = make(map[string]*slot) +} + +// DecRef implements Inode.DecRef. +func (o *OrderedChildren) DecRef() { + o.AtomicRefCount.DecRefWithDestructor(o.Destroy) +} + +// Destroy cleans up resources referenced by this OrderedChildren. +func (o *OrderedChildren) Destroy() { + o.mu.Lock() + defer o.mu.Unlock() + o.order.Reset() + o.set = nil +} + +// Populate inserts children into this OrderedChildren, and d's dentry +// cache. Populate returns the number of directories inserted, which the caller +// may use to update the link count for the parent directory. +// +// Precondition: d.Impl() must be a kernfs Dentry. d must represent a directory +// inode. children must not contain any conflicting entries already in o. +func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint32 { + var links uint32 + for name, child := range children { + if child.isDir() { + links++ + } + if err := o.Insert(name, child.VFSDentry()); err != nil { + panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d)) + } + d.InsertChild(name, child.VFSDentry()) + } + return links +} + +// HasChildren implements Inode.HasChildren. +func (o *OrderedChildren) HasChildren() bool { + o.mu.RLock() + defer o.mu.RUnlock() + return len(o.set) > 0 +} + +// Insert inserts child into o. This ignores the writability of o, as this is +// not part of the vfs.FilesystemImpl interface, and is a lower-level operation. +func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error { + o.mu.Lock() + defer o.mu.Unlock() + if _, ok := o.set[name]; ok { + return syserror.EEXIST + } + s := &slot{ + Name: name, + Dentry: child, + } + o.order.PushBack(s) + o.set[name] = s + return nil +} + +// Precondition: caller must hold o.mu for writing. +func (o *OrderedChildren) removeLocked(name string) { + if s, ok := o.set[name]; ok { + delete(o.set, name) + o.order.Remove(s) + } +} + +// Precondition: caller must hold o.mu for writing. +func (o *OrderedChildren) replaceChildLocked(name string, new *vfs.Dentry) *vfs.Dentry { + if s, ok := o.set[name]; ok { + // Existing slot with given name, simply replace the dentry. + var old *vfs.Dentry + old, s.Dentry = s.Dentry, new + return old + } + + // No existing slot with given name, create and hash new slot. + s := &slot{ + Name: name, + Dentry: new, + } + o.order.PushBack(s) + o.set[name] = s + return nil +} + +// Precondition: caller must hold o.mu for reading or writing. +func (o *OrderedChildren) checkExistingLocked(name string, child *vfs.Dentry) error { + s, ok := o.set[name] + if !ok { + return syserror.ENOENT + } + if s.Dentry != child { + panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! OrderedChild: %+v, vfs: %+v", s.Dentry, child)) + } + return nil +} + +// Unlink implements Inode.Unlink. +func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.Dentry) error { + if !o.writable { + return syserror.EPERM + } + o.mu.Lock() + defer o.mu.Unlock() + if err := o.checkExistingLocked(name, child); err != nil { + return err + } + o.removeLocked(name) + return nil +} + +// Rmdir implements Inode.Rmdir. +func (o *OrderedChildren) RmDir(ctx context.Context, name string, child *vfs.Dentry) error { + // We're not responsible for checking that child is a directory, that it's + // empty, or updating any link counts; so this is the same as unlink. + return o.Unlink(ctx, name, child) +} + +type renameAcrossDifferentImplementationsError struct{} + +func (renameAcrossDifferentImplementationsError) Error() string { + return "rename across inodes with different implementations" +} + +// Rename implements Inode.Rename. +// +// Precondition: Rename may only be called across two directory inodes with +// identical implementations of Rename. Practically, this means filesystems that +// implement Rename by embedding OrderedChildren for any directory +// implementation must use OrderedChildren for all directory implementations +// that will support Rename. +// +// Postcondition: reference on any replaced dentry transferred to caller. +func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (*vfs.Dentry, error) { + dst, ok := dstDir.Impl().(*Dentry).inode.(interface{}).(*OrderedChildren) + if !ok { + return nil, renameAcrossDifferentImplementationsError{} + } + if !o.writable || !dst.writable { + return nil, syserror.EPERM + } + // Note: There's a potential deadlock below if concurrent calls to Rename + // refer to the same src and dst directories in reverse. We avoid any + // ordering issues because the caller is required to serialize concurrent + // calls to Rename in accordance with the interface declaration. + o.mu.Lock() + defer o.mu.Unlock() + if dst != o { + dst.mu.Lock() + defer dst.mu.Unlock() + } + if err := o.checkExistingLocked(oldname, child); err != nil { + return nil, err + } + replaced := dst.replaceChildLocked(newname, child) + return replaced, nil +} + +// nthLocked returns an iterator to the nth child tracked by this object. The +// iterator is valid until the caller releases o.mu. Returns nil if the +// requested index falls out of bounds. +// +// Preconditon: Caller must hold o.mu for reading. +func (o *OrderedChildren) nthLocked(i int64) *slot { + for it := o.order.Front(); it != nil && i >= 0; it = it.Next() { + if i == 0 { + return it + } + i-- + } + return nil +} + +// InodeSymlink partially implements Inode interface for symlinks. +type InodeSymlink struct { + InodeNotDirectory +} + +// Open implements Inode.Open. +func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + return nil, syserror.ELOOP +} diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go new file mode 100644 index 000000000..d69b299ae --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -0,0 +1,422 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package kernfs provides the tools to implement inode-based filesystems. +// Kernfs has two main features: +// +// 1. The Inode interface, which maps VFS2's path-based filesystem operations to +// specific filesystem nodes. Kernfs uses the Inode interface to provide a +// blanket implementation for the vfs.FilesystemImpl. Kernfs also serves as +// the synchronization mechanism for all filesystem operations by holding a +// filesystem-wide lock across all operations. +// +// 2. Various utility types which provide generic implementations for various +// parts of the Inode and vfs.FileDescription interfaces. Client filesystems +// based on kernfs can embed the appropriate set of these to avoid having to +// reimplement common filesystem operations. See inode_impl_util.go and +// fd_impl_util.go. +// +// Reference Model: +// +// Kernfs dentries represents named pointers to inodes. Dentries and inode have +// independent lifetimes and reference counts. A child dentry unconditionally +// holds a reference on its parent directory's dentry. A dentry also holds a +// reference on the inode it points to. Multiple dentries can point to the same +// inode (for example, in the case of hardlinks). File descriptors hold a +// reference to the dentry they're opened on. +// +// Dentries are guaranteed to exist while holding Filesystem.mu for +// reading. Dropping dentries require holding Filesystem.mu for writing. To +// queue dentries for destruction from a read critical section, see +// Filesystem.deferDecRef. +// +// Lock ordering: +// +// kernfs.Filesystem.mu +// kernfs.Dentry.dirMu +// vfs.VirtualFilesystem.mountMu +// vfs.Dentry.mu +// kernfs.Filesystem.droppedDentriesMu +// (inode implementation locks, if any) +package kernfs + +import ( + "fmt" + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// FilesystemType implements vfs.FilesystemType. +type FilesystemType struct{} + +// Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory +// filesystem. Concrete implementations are expected to embed this in their own +// Filesystem type. +type Filesystem struct { + vfsfs vfs.Filesystem + + droppedDentriesMu sync.Mutex + + // droppedDentries is a list of dentries waiting to be DecRef()ed. This is + // used to defer dentry destruction until mu can be acquired for + // writing. Protected by droppedDentriesMu. + droppedDentries []*vfs.Dentry + + // mu synchronizes the lifetime of Dentries on this filesystem. Holding it + // for reading guarantees continued existence of any resolved dentries, but + // the dentry tree may be modified. + // + // Kernfs dentries can only be DecRef()ed while holding mu for writing. For + // example: + // + // fs.mu.Lock() + // defer fs.mu.Unlock() + // ... + // dentry1.DecRef() + // defer dentry2.DecRef() // Ok, will run before Unlock. + // + // If discarding dentries in a read context, use Filesystem.deferDecRef. For + // example: + // + // fs.mu.RLock() + // fs.mu.processDeferredDecRefs() + // defer fs.mu.RUnlock() + // ... + // fs.deferDecRef(dentry) + mu sync.RWMutex + + // nextInoMinusOne is used to to allocate inode numbers on this + // filesystem. Must be accessed by atomic operations. + nextInoMinusOne uint64 +} + +// deferDecRef defers dropping a dentry ref until the next call to +// processDeferredDecRefs{,Locked}. See comment on Filesystem.mu. +// +// Precondition: d must not already be pending destruction. +func (fs *Filesystem) deferDecRef(d *vfs.Dentry) { + fs.droppedDentriesMu.Lock() + fs.droppedDentries = append(fs.droppedDentries, d) + fs.droppedDentriesMu.Unlock() +} + +// processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the +// droppedDentries list. See comment on Filesystem.mu. +func (fs *Filesystem) processDeferredDecRefs() { + fs.mu.Lock() + fs.processDeferredDecRefsLocked() + fs.mu.Unlock() +} + +// Precondition: fs.mu must be held for writing. +func (fs *Filesystem) processDeferredDecRefsLocked() { + fs.droppedDentriesMu.Lock() + for _, d := range fs.droppedDentries { + d.DecRef() + } + fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse. + fs.droppedDentriesMu.Unlock() +} + +// Init initializes a kernfs filesystem. This should be called from during +// vfs.FilesystemType.NewFilesystem for the concrete filesystem embedding +// kernfs. +func (fs *Filesystem) Init(vfsObj *vfs.VirtualFilesystem) { + fs.vfsfs.Init(vfsObj, fs) +} + +// VFSFilesystem returns the generic vfs filesystem object. +func (fs *Filesystem) VFSFilesystem() *vfs.Filesystem { + return &fs.vfsfs +} + +// NextIno allocates a new inode number on this filesystem. +func (fs *Filesystem) NextIno() uint64 { + return atomic.AddUint64(&fs.nextInoMinusOne, 1) +} + +// These consts are used in the Dentry.flags field. +const ( + // Dentry points to a directory inode. + dflagsIsDir = 1 << iota + + // Dentry points to a symlink inode. + dflagsIsSymlink +) + +// Dentry implements vfs.DentryImpl. +// +// A kernfs dentry is similar to a dentry in a traditional filesystem: it's a +// named reference to an inode. A dentry generally lives as long as it's part of +// a mounted filesystem tree. Kernfs doesn't cache dentries once all references +// to them are removed. Dentries hold a single reference to the inode they point +// to, and child dentries hold a reference on their parent. +// +// Must be initialized by Init prior to first use. +type Dentry struct { + refs.AtomicRefCount + + vfsd vfs.Dentry + inode Inode + + refs uint64 + + // flags caches useful information about the dentry from the inode. See the + // dflags* consts above. Must be accessed by atomic ops. + flags uint32 + + // dirMu protects vfsd.children for directory dentries. + dirMu sync.Mutex +} + +// Init initializes this dentry. +// +// Precondition: Caller must hold a reference on inode. +// +// Postcondition: Caller's reference on inode is transferred to the dentry. +func (d *Dentry) Init(inode Inode) { + d.vfsd.Init(d) + d.inode = inode + ftype := inode.Mode().FileType() + if ftype == linux.ModeDirectory { + d.flags |= dflagsIsDir + } + if ftype == linux.ModeSymlink { + d.flags |= dflagsIsSymlink + } +} + +// VFSDentry returns the generic vfs dentry for this kernfs dentry. +func (d *Dentry) VFSDentry() *vfs.Dentry { + return &d.vfsd +} + +// isDir checks whether the dentry points to a directory inode. +func (d *Dentry) isDir() bool { + return atomic.LoadUint32(&d.flags)&dflagsIsDir != 0 +} + +// isSymlink checks whether the dentry points to a symlink inode. +func (d *Dentry) isSymlink() bool { + return atomic.LoadUint32(&d.flags)&dflagsIsSymlink != 0 +} + +// DecRef implements vfs.DentryImpl.DecRef. +func (d *Dentry) DecRef() { + d.AtomicRefCount.DecRefWithDestructor(d.destroy) +} + +// Precondition: Dentry must be removed from VFS' dentry cache. +func (d *Dentry) destroy() { + d.inode.DecRef() // IncRef from Init. + d.inode = nil + if parent := d.vfsd.Parent(); parent != nil { + parent.DecRef() // IncRef from Dentry.InsertChild. + } +} + +// InsertChild inserts child into the vfs dentry cache with the given name under +// this dentry. This does not update the directory inode, so calling this on +// it's own isn't sufficient to insert a child into a directory. InsertChild +// updates the link count on d if required. +// +// Precondition: d must represent a directory inode. +func (d *Dentry) InsertChild(name string, child *vfs.Dentry) { + d.dirMu.Lock() + d.insertChildLocked(name, child) + d.dirMu.Unlock() +} + +// insertChildLocked is equivalent to InsertChild, with additional +// preconditions. +// +// Precondition: d.dirMu must be locked. +func (d *Dentry) insertChildLocked(name string, child *vfs.Dentry) { + if !d.isDir() { + panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d)) + } + vfsDentry := d.VFSDentry() + vfsDentry.IncRef() // DecRef in child's Dentry.destroy. + vfsDentry.InsertChild(child, name) +} + +// The Inode interface maps filesystem-level operations that operate on paths to +// equivalent operations on specific filesystem nodes. +// +// The interface methods are groups into logical categories as sub interfaces +// below. Generally, an implementation for each sub interface can be provided by +// embedding an appropriate type from inode_impl_utils.go. The sub interfaces +// are purely organizational. Methods declared directly in the main interface +// have no generic implementations, and should be explicitly provided by the +// client filesystem. +// +// Generally, implementations are not responsible for tasks that are common to +// all filesystems. These include: +// +// - Checking that dentries passed to methods are of the appropriate file type. +// - Checking permissions. +// - Updating link and reference counts. +// +// Specific responsibilities of implementations are documented below. +type Inode interface { + // Methods related to reference counting. A generic implementation is + // provided by InodeNoopRefCount. These methods are generally called by the + // equivalent Dentry methods. + inodeRefs + + // Methods related to node metadata. A generic implementation is provided by + // InodeAttrs. + inodeMetadata + + // Method for inodes that represent symlink. InodeNotSymlink provides a + // blanket implementation for all non-symlink inodes. + inodeSymlink + + // Method for inodes that represent directories. InodeNotDirectory provides + // a blanket implementation for all non-directory inodes. + inodeDirectory + + // Method for inodes that represent dynamic directories and their + // children. InodeNoDynamicLookup provides a blanket implementation for all + // non-dynamic-directory inodes. + inodeDynamicLookup + + // Open creates a file description for the filesystem object represented by + // this inode. The returned file description should hold a reference on the + // inode for its lifetime. + // + // Precondition: !rp.Done(). vfsd.Impl() must be a kernfs Dentry. + Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) +} + +type inodeRefs interface { + IncRef() + DecRef() + TryIncRef() bool + // Destroy is called when the inode reaches zero references. Destroy release + // all resources (references) on objects referenced by the inode, including + // any child dentries. + Destroy() +} + +type inodeMetadata interface { + // CheckPermissions checks that creds may access this inode for the + // requested access type, per the the rules of + // fs/namei.c:generic_permission(). + CheckPermissions(creds *auth.Credentials, atx vfs.AccessTypes) error + + // Mode returns the (struct stat)::st_mode value for this inode. This is + // separated from Stat for performance. + Mode() linux.FileMode + + // Stat returns the metadata for this inode. This corresponds to + // vfs.FilesystemImpl.StatAt. + Stat(fs *vfs.Filesystem) linux.Statx + + // SetStat updates the metadata for this inode. This corresponds to + // vfs.FilesystemImpl.SetStatAt. + SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error +} + +// Precondition: All methods in this interface may only be called on directory +// inodes. +type inodeDirectory interface { + // The New{File,Dir,Node,Symlink} methods below should return a new inode + // hashed into this inode. + // + // These inode constructors are inode-level operations rather than + // filesystem-level operations to allow client filesystems to mix different + // implementations based on the new node's location in the + // filesystem. + + // HasChildren returns true if the directory inode has any children. + HasChildren() bool + + // NewFile creates a new regular file inode. + NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) + + // NewDir creates a new directory inode. + NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) + + // NewLink creates a new hardlink to a specified inode in this + // directory. Implementations should create a new kernfs Dentry pointing to + // target, and update target's link count. + NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error) + + // NewSymlink creates a new symbolic link inode. + NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error) + + // NewNode creates a new filesystem node for a mknod syscall. + NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error) + + // Unlink removes a child dentry from this directory inode. + Unlink(ctx context.Context, name string, child *vfs.Dentry) error + + // RmDir removes an empty child directory from this directory + // inode. Implementations must update the parent directory's link count, + // if required. Implementations are not responsible for checking that child + // is a directory, checking for an empty directory. + RmDir(ctx context.Context, name string, child *vfs.Dentry) error + + // Rename is called on the source directory containing an inode being + // renamed. child should point to the resolved child in the source + // directory. If Rename replaces a dentry in the destination directory, it + // should return the replaced dentry or nil otherwise. + // + // Precondition: Caller must serialize concurrent calls to Rename. + Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (replaced *vfs.Dentry, err error) +} + +type inodeDynamicLookup interface { + // Lookup should return an appropriate dentry if name should resolve to a + // child of this dynamic directory inode. This gives the directory an + // opportunity on every lookup to resolve additional entries that aren't + // hashed into the directory. This is only called when the inode is a + // directory. If the inode is not a directory, or if the directory only + // contains a static set of children, the implementer can unconditionally + // return an appropriate error (ENOTDIR and ENOENT respectively). + // + // The child returned by Lookup will be hashed into the VFS dentry tree. Its + // lifetime can be controlled by the filesystem implementation with an + // appropriate implementation of Valid. + // + // Lookup returns the child with an extra reference and the caller owns this + // reference. + Lookup(ctx context.Context, name string) (*vfs.Dentry, error) + + // Valid should return true if this inode is still valid, or needs to + // be resolved again by a call to Lookup. + Valid(ctx context.Context) bool + + // IterDirents is used to iterate over dynamically created entries. It invokes + // cb on each entry in the directory represented by the FileDescription. + // 'offset' is the offset for the entire IterDirents call, which may include + // results from the caller. 'relOffset' is the offset inside the entries + // returned by this IterDirents invocation. In other words, + // 'offset+relOffset+1' is the value that should be set in vfs.Dirent.NextOff, + // while 'relOffset' is the place where iteration should start from. + IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) +} + +type inodeSymlink interface { + // Readlink resolves the target of a symbolic link. If an inode is not a + // symlink, the implementation should return EINVAL. + Readlink(ctx context.Context) (string, error) +} diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go new file mode 100644 index 000000000..4b6b95f5f --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -0,0 +1,426 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernfs_test + +import ( + "bytes" + "fmt" + "io" + "runtime" + "sync" + "testing" + + "github.com/google/go-cmp/cmp" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +const defaultMode linux.FileMode = 01777 +const staticFileContent = "This is sample content for a static test file." + +// RootDentryFn is a generator function for creating the root dentry of a test +// filesystem. See newTestSystem. +type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry + +// TestSystem represents the context for a single test. +type TestSystem struct { + t *testing.T + ctx context.Context + creds *auth.Credentials + vfs *vfs.VirtualFilesystem + mns *vfs.MountNamespace + root vfs.VirtualDentry +} + +// newTestSystem sets up a minimal environment for running a test, including an +// instance of a test filesystem. Tests can control the contents of the +// filesystem by providing an appropriate rootFn, which should return a +// pre-populated root dentry. +func newTestSystem(t *testing.T, rootFn RootDentryFn) *TestSystem { + ctx := contexttest.Context(t) + creds := auth.CredentialsFromContext(ctx) + v := vfs.New() + v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{}) + if err != nil { + t.Fatalf("Failed to create testfs root mount: %v", err) + } + + s := &TestSystem{ + t: t, + ctx: ctx, + creds: creds, + vfs: v, + mns: mns, + root: mns.Root(), + } + runtime.SetFinalizer(s, func(s *TestSystem) { s.root.DecRef() }) + return s +} + +// PathOpAtRoot constructs a vfs.PathOperation for a path from the +// root of the test filesystem. +// +// Precondition: path should be relative path. +func (s *TestSystem) PathOpAtRoot(path string) vfs.PathOperation { + return vfs.PathOperation{ + Root: s.root, + Start: s.root, + Path: fspath.Parse(path), + } +} + +// GetDentryOrDie attempts to resolve a dentry referred to by the +// provided path operation. If unsuccessful, the test fails. +func (s *TestSystem) GetDentryOrDie(pop vfs.PathOperation) vfs.VirtualDentry { + vd, err := s.vfs.GetDentryAt(s.ctx, s.creds, &pop, &vfs.GetDentryOptions{}) + if err != nil { + s.t.Fatalf("GetDentryAt(pop:%+v) failed: %v", pop, err) + } + return vd +} + +func (s *TestSystem) ReadToEnd(fd *vfs.FileDescription) (string, error) { + buf := make([]byte, usermem.PageSize) + bufIOSeq := usermem.BytesIOSequence(buf) + opts := vfs.ReadOptions{} + + var content bytes.Buffer + for { + n, err := fd.Impl().Read(s.ctx, bufIOSeq, opts) + if n == 0 || err != nil { + if err == io.EOF { + err = nil + } + return content.String(), err + } + content.Write(buf[:n]) + } +} + +type fsType struct { + rootFn RootDentryFn +} + +type filesystem struct { + kernfs.Filesystem +} + +type file struct { + kernfs.DynamicBytesFile + content string +} + +func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry { + f := &file{} + f.content = content + f.DynamicBytesFile.Init(creds, fs.NextIno(), f, 0777) + + d := &kernfs.Dentry{} + d.Init(f) + return d +} + +func (f *file) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "%s", f.content) + return nil +} + +type attrs struct { + kernfs.InodeAttrs +} + +func (a *attrs) SetStat(fs *vfs.Filesystem, opt vfs.SetStatOptions) error { + return syserror.EPERM +} + +type readonlyDir struct { + attrs + kernfs.InodeNotSymlink + kernfs.InodeNoDynamicLookup + kernfs.InodeDirectoryNoNewChildren + + kernfs.OrderedChildren + dentry kernfs.Dentry +} + +func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { + dir := &readonlyDir{} + dir.attrs.Init(creds, fs.NextIno(), linux.ModeDirectory|mode) + dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + dir.dentry.Init(dir) + + dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents)) + + return &dir.dentry +} + +func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + fd := &kernfs.GenericDirectoryFD{} + fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags) + return fd.VFSFileDescription(), nil +} + +type dir struct { + attrs + kernfs.InodeNotSymlink + kernfs.InodeNoDynamicLookup + + fs *filesystem + dentry kernfs.Dentry + kernfs.OrderedChildren +} + +func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { + dir := &dir{} + dir.fs = fs + dir.attrs.Init(creds, fs.NextIno(), linux.ModeDirectory|mode) + dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true}) + dir.dentry.Init(dir) + + dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents)) + + return &dir.dentry +} + +func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + fd := &kernfs.GenericDirectoryFD{} + fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags) + return fd.VFSFileDescription(), nil +} + +func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) { + creds := auth.CredentialsFromContext(ctx) + dir := d.fs.newDir(creds, opts.Mode, nil) + dirVFSD := dir.VFSDentry() + if err := d.OrderedChildren.Insert(name, dirVFSD); err != nil { + dir.DecRef() + return nil, err + } + d.IncLinks(1) + return dirVFSD, nil +} + +func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) { + creds := auth.CredentialsFromContext(ctx) + f := d.fs.newFile(creds, "") + fVFSD := f.VFSDentry() + if err := d.OrderedChildren.Insert(name, fVFSD); err != nil { + f.DecRef() + return nil, err + } + return fVFSD, nil +} + +func (*dir) NewLink(context.Context, string, kernfs.Inode) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +func (*dir) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +func (fst *fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + fs := &filesystem{} + fs.Init(vfsObj) + root := fst.rootFn(creds, fs) + return fs.VFSFilesystem(), root.VFSDentry(), nil +} + +// -------------------- Remainder of the file are test cases -------------------- + +func TestBasic(t *testing.T) { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + "file1": fs.newFile(creds, staticFileContent), + }) + }) + sys.GetDentryOrDie(sys.PathOpAtRoot("file1")).DecRef() +} + +func TestMkdirGetDentry(t *testing.T) { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + "dir1": fs.newDir(creds, 0755, nil), + }) + }) + + pop := sys.PathOpAtRoot("dir1/a new directory") + if err := sys.vfs.MkdirAt(sys.ctx, sys.creds, &pop, &vfs.MkdirOptions{Mode: 0755}); err != nil { + t.Fatalf("MkdirAt for PathOperation %+v failed: %v", pop, err) + } + sys.GetDentryOrDie(pop).DecRef() +} + +func TestReadStaticFile(t *testing.T) { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + "file1": fs.newFile(creds, staticFileContent), + }) + }) + + pop := sys.PathOpAtRoot("file1") + fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{}) + if err != nil { + sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) + } + defer fd.DecRef() + + content, err := sys.ReadToEnd(fd) + if err != nil { + sys.t.Fatalf("Read failed: %v", err) + } + if diff := cmp.Diff(staticFileContent, content); diff != "" { + sys.t.Fatalf("Read returned unexpected data:\n--- want\n+++ got\n%v", diff) + } +} + +func TestCreateNewFileInStaticDir(t *testing.T) { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + "dir1": fs.newDir(creds, 0755, nil), + }) + }) + + pop := sys.PathOpAtRoot("dir1/newfile") + opts := &vfs.OpenOptions{Flags: linux.O_CREAT | linux.O_EXCL, Mode: defaultMode} + fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, opts) + if err != nil { + sys.t.Fatalf("OpenAt(pop:%+v, opts:%+v) failed: %v", pop, opts, err) + } + + // Close the file. The file should persist. + fd.DecRef() + + fd, err = sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{}) + if err != nil { + sys.t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err) + } + fd.DecRef() +} + +// direntCollector provides an implementation for vfs.IterDirentsCallback for +// testing. It simply iterates to the end of a given directory FD and collects +// all dirents emitted by the callback. +type direntCollector struct { + mu sync.Mutex + dirents map[string]vfs.Dirent +} + +// Handle implements vfs.IterDirentsCallback.Handle. +func (d *direntCollector) Handle(dirent vfs.Dirent) bool { + d.mu.Lock() + if d.dirents == nil { + d.dirents = make(map[string]vfs.Dirent) + } + d.dirents[dirent.Name] = dirent + d.mu.Unlock() + return true +} + +// count returns the number of dirents currently in the collector. +func (d *direntCollector) count() int { + d.mu.Lock() + defer d.mu.Unlock() + return len(d.dirents) +} + +// contains checks whether the collector has a dirent with the given name and +// type. +func (d *direntCollector) contains(name string, typ uint8) error { + d.mu.Lock() + defer d.mu.Unlock() + dirent, ok := d.dirents[name] + if !ok { + return fmt.Errorf("No dirent named %q found", name) + } + if dirent.Type != typ { + return fmt.Errorf("Dirent named %q found, but was expecting type %d, got: %+v", name, typ, dirent) + } + return nil +} + +func TestDirFDReadWrite(t *testing.T) { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + return fs.newReadonlyDir(creds, 0755, nil) + }) + + pop := sys.PathOpAtRoot("/") + fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{}) + if err != nil { + sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) + } + defer fd.DecRef() + + // Read/Write should fail for directory FDs. + if _, err := fd.Read(sys.ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR { + sys.t.Fatalf("Read for directory FD failed with unexpected error: %v", err) + } + if _, err := fd.Write(sys.ctx, usermem.BytesIOSequence([]byte{}), vfs.WriteOptions{}); err != syserror.EISDIR { + sys.t.Fatalf("Wrire for directory FD failed with unexpected error: %v", err) + } +} + +func TestDirFDIterDirents(t *testing.T) { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + // Fill root with nodes backed by various inode implementations. + "dir1": fs.newReadonlyDir(creds, 0755, nil), + "dir2": fs.newDir(creds, 0755, map[string]*kernfs.Dentry{ + "dir3": fs.newDir(creds, 0755, nil), + }), + "file1": fs.newFile(creds, staticFileContent), + }) + }) + + pop := sys.PathOpAtRoot("/") + fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{}) + if err != nil { + sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) + } + defer fd.DecRef() + + collector := &direntCollector{} + if err := fd.IterDirents(sys.ctx, collector); err != nil { + sys.t.Fatalf("IterDirent failed: %v", err) + } + + // Root directory should contain ".", ".." and 3 children: + if collector.count() != 5 { + sys.t.Fatalf("IterDirent returned too many dirents") + } + for _, dirName := range []string{".", "..", "dir1", "dir2"} { + if err := collector.contains(dirName, linux.DT_DIR); err != nil { + sys.t.Fatalf("IterDirent had unexpected results: %v", err) + } + } + if err := collector.contains("file1", linux.DT_REG); err != nil { + sys.t.Fatalf("IterDirent had unexpected results: %v", err) + } + +} diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go new file mode 100644 index 000000000..068063f4e --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -0,0 +1,45 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +type staticSymlink struct { + InodeAttrs + InodeNoopRefCount + InodeSymlink + + target string +} + +var _ Inode = (*staticSymlink)(nil) + +// NewStaticSymlink creates a new symlink file pointing to 'target'. +func NewStaticSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, target string) *Dentry { + inode := &staticSymlink{target: target} + inode.Init(creds, ino, linux.ModeSymlink|perm) + + d := &Dentry{} + d.Init(inode) + return d +} + +func (s *staticSymlink) Readlink(_ context.Context) (string, error) { + return s.target, nil +} diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go deleted file mode 100644 index 08a9cb8ef..000000000 --- a/pkg/sentry/fsimpl/memfs/filesystem.go +++ /dev/null @@ -1,584 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package memfs - -import ( - "fmt" - "sync/atomic" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" -) - -// stepLocked resolves rp.Component() in parent directory vfsd. -// -// stepLocked is loosely analogous to fs/namei.c:walk_component(). -// -// Preconditions: filesystem.mu must be locked. !rp.Done(). inode == -// vfsd.Impl().(*dentry).inode. -func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode) (*vfs.Dentry, *inode, error) { - if !inode.isDir() { - return nil, nil, syserror.ENOTDIR - } - if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { - return nil, nil, err - } -afterSymlink: - nextVFSD, err := rp.ResolveComponent(vfsd) - if err != nil { - return nil, nil, err - } - if nextVFSD == nil { - // Since the Dentry tree is the sole source of truth for memfs, if it's - // not in the Dentry tree, it doesn't exist. - return nil, nil, syserror.ENOENT - } - nextInode := nextVFSD.Impl().(*dentry).inode - if symlink, ok := nextInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { - // TODO: symlink traversals update access time - if err := rp.HandleSymlink(symlink.target); err != nil { - return nil, nil, err - } - goto afterSymlink // don't check the current directory again - } - rp.Advance() - return nextVFSD, nextInode, nil -} - -// walkExistingLocked resolves rp to an existing file. -// -// walkExistingLocked is loosely analogous to Linux's -// fs/namei.c:path_lookupat(). -// -// Preconditions: filesystem.mu must be locked. -func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) { - vfsd := rp.Start() - inode := vfsd.Impl().(*dentry).inode - for !rp.Done() { - var err error - vfsd, inode, err = stepLocked(rp, vfsd, inode) - if err != nil { - return nil, nil, err - } - } - if rp.MustBeDir() && !inode.isDir() { - return nil, nil, syserror.ENOTDIR - } - return vfsd, inode, nil -} - -// walkParentDirLocked resolves all but the last path component of rp to an -// existing directory. It does not check that the returned directory is -// searchable by the provider of rp. -// -// walkParentDirLocked is loosely analogous to Linux's -// fs/namei.c:path_parentat(). -// -// Preconditions: filesystem.mu must be locked. !rp.Done(). -func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) { - vfsd := rp.Start() - inode := vfsd.Impl().(*dentry).inode - for !rp.Final() { - var err error - vfsd, inode, err = stepLocked(rp, vfsd, inode) - if err != nil { - return nil, nil, err - } - } - if !inode.isDir() { - return nil, nil, syserror.ENOTDIR - } - return vfsd, inode, nil -} - -// checkCreateLocked checks that a file named rp.Component() may be created in -// directory parentVFSD, then returns rp.Component(). -// -// Preconditions: filesystem.mu must be locked. parentInode == -// parentVFSD.Impl().(*dentry).inode. parentInode.isDir() == true. -func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode *inode) (string, error) { - if err := parentInode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { - return "", err - } - pc := rp.Component() - if pc == "." || pc == ".." { - return "", syserror.EEXIST - } - childVFSD, err := rp.ResolveChild(parentVFSD, pc) - if err != nil { - return "", err - } - if childVFSD != nil { - return "", syserror.EEXIST - } - if parentVFSD.IsDisowned() { - return "", syserror.ENOENT - } - return pc, nil -} - -// checkDeleteLocked checks that the file represented by vfsd may be deleted. -func checkDeleteLocked(vfsd *vfs.Dentry) error { - parentVFSD := vfsd.Parent() - if parentVFSD == nil { - return syserror.EBUSY - } - if parentVFSD.IsDisowned() { - return syserror.ENOENT - } - return nil -} - -// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. -func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { - fs.mu.RLock() - defer fs.mu.RUnlock() - vfsd, inode, err := walkExistingLocked(rp) - if err != nil { - return nil, err - } - if opts.CheckSearchable { - if !inode.isDir() { - return nil, syserror.ENOTDIR - } - if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { - return nil, err - } - } - inode.incRef() - return vfsd, nil -} - -// LinkAt implements vfs.FilesystemImpl.LinkAt. -func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { - if rp.Done() { - return syserror.EEXIST - } - fs.mu.Lock() - defer fs.mu.Unlock() - parentVFSD, parentInode, err := walkParentDirLocked(rp) - if err != nil { - return err - } - pc, err := checkCreateLocked(rp, parentVFSD, parentInode) - if err != nil { - return err - } - if rp.Mount() != vd.Mount() { - return syserror.EXDEV - } - if err := rp.Mount().CheckBeginWrite(); err != nil { - return err - } - defer rp.Mount().EndWrite() - d := vd.Dentry().Impl().(*dentry) - if d.inode.isDir() { - return syserror.EPERM - } - d.inode.incLinksLocked() - child := fs.newDentry(d.inode) - parentVFSD.InsertChild(&child.vfsd, pc) - parentInode.impl.(*directory).childList.PushBack(child) - return nil -} - -// MkdirAt implements vfs.FilesystemImpl.MkdirAt. -func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { - if rp.Done() { - return syserror.EEXIST - } - fs.mu.Lock() - defer fs.mu.Unlock() - parentVFSD, parentInode, err := walkParentDirLocked(rp) - if err != nil { - return err - } - pc, err := checkCreateLocked(rp, parentVFSD, parentInode) - if err != nil { - return err - } - if err := rp.Mount().CheckBeginWrite(); err != nil { - return err - } - defer rp.Mount().EndWrite() - child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode)) - parentVFSD.InsertChild(&child.vfsd, pc) - parentInode.impl.(*directory).childList.PushBack(child) - parentInode.incLinksLocked() // from child's ".." - return nil -} - -// MknodAt implements vfs.FilesystemImpl.MknodAt. -func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { - if rp.Done() { - return syserror.EEXIST - } - fs.mu.Lock() - defer fs.mu.Unlock() - parentVFSD, parentInode, err := walkParentDirLocked(rp) - if err != nil { - return err - } - pc, err := checkCreateLocked(rp, parentVFSD, parentInode) - if err != nil { - return err - } - if err := rp.Mount().CheckBeginWrite(); err != nil { - return err - } - defer rp.Mount().EndWrite() - - switch opts.Mode.FileType() { - case 0: - // "Zero file type is equivalent to type S_IFREG." - mknod(2) - fallthrough - case linux.ModeRegular: - // TODO(b/138862511): Implement. - return syserror.EINVAL - - case linux.ModeNamedPipe: - child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode)) - parentVFSD.InsertChild(&child.vfsd, pc) - parentInode.impl.(*directory).childList.PushBack(child) - return nil - - case linux.ModeSocket: - // TODO(b/138862511): Implement. - return syserror.EINVAL - - case linux.ModeCharacterDevice: - fallthrough - case linux.ModeBlockDevice: - // TODO(b/72101894): We don't support creating block or character - // devices at the moment. - // - // When we start supporting block and character devices, we'll - // need to check for CAP_MKNOD here. - return syserror.EPERM - - default: - // "EINVAL - mode requested creation of something other than a - // regular file, device special file, FIFO or socket." - mknod(2) - return syserror.EINVAL - } -} - -// OpenAt implements vfs.FilesystemImpl.OpenAt. -func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - // Filter out flags that are not supported by memfs. O_DIRECTORY and - // O_NOFOLLOW have no effect here (they're handled by VFS by setting - // appropriate bits in rp), but are returned by - // FileDescriptionImpl.StatusFlags(). O_NONBLOCK is supported only by - // pipes. - opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK - - if opts.Flags&linux.O_CREAT == 0 { - fs.mu.RLock() - defer fs.mu.RUnlock() - vfsd, inode, err := walkExistingLocked(rp) - if err != nil { - return nil, err - } - return inode.open(ctx, rp, vfsd, opts.Flags, false) - } - - mustCreate := opts.Flags&linux.O_EXCL != 0 - vfsd := rp.Start() - inode := vfsd.Impl().(*dentry).inode - fs.mu.Lock() - defer fs.mu.Unlock() - if rp.Done() { - if rp.MustBeDir() { - return nil, syserror.EISDIR - } - if mustCreate { - return nil, syserror.EEXIST - } - return inode.open(ctx, rp, vfsd, opts.Flags, false) - } -afterTrailingSymlink: - // Walk to the parent directory of the last path component. - for !rp.Final() { - var err error - vfsd, inode, err = stepLocked(rp, vfsd, inode) - if err != nil { - return nil, err - } - } - if !inode.isDir() { - return nil, syserror.ENOTDIR - } - // Check for search permission in the parent directory. - if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { - return nil, err - } - // Reject attempts to open directories with O_CREAT. - if rp.MustBeDir() { - return nil, syserror.EISDIR - } - pc := rp.Component() - if pc == "." || pc == ".." { - return nil, syserror.EISDIR - } - // Determine whether or not we need to create a file. - childVFSD, err := rp.ResolveChild(vfsd, pc) - if err != nil { - return nil, err - } - if childVFSD == nil { - // Already checked for searchability above; now check for writability. - if err := inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { - return nil, err - } - if err := rp.Mount().CheckBeginWrite(); err != nil { - return nil, err - } - defer rp.Mount().EndWrite() - // Create and open the child. - childInode := fs.newRegularFile(rp.Credentials(), opts.Mode) - child := fs.newDentry(childInode) - vfsd.InsertChild(&child.vfsd, pc) - inode.impl.(*directory).childList.PushBack(child) - return childInode.open(ctx, rp, &child.vfsd, opts.Flags, true) - } - // Open existing file or follow symlink. - if mustCreate { - return nil, syserror.EEXIST - } - childInode := childVFSD.Impl().(*dentry).inode - if symlink, ok := childInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { - // TODO: symlink traversals update access time - if err := rp.HandleSymlink(symlink.target); err != nil { - return nil, err - } - // rp.Final() may no longer be true since we now need to resolve the - // symlink target. - goto afterTrailingSymlink - } - return childInode.open(ctx, rp, childVFSD, opts.Flags, false) -} - -func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) { - ats := vfs.AccessTypesForOpenFlags(flags) - if !afterCreate { - if err := i.checkPermissions(rp.Credentials(), ats, i.isDir()); err != nil { - return nil, err - } - } - mnt := rp.Mount() - switch impl := i.impl.(type) { - case *regularFile: - var fd regularFileFD - fd.flags = flags - fd.readable = vfs.MayReadFileWithOpenFlags(flags) - fd.writable = vfs.MayWriteFileWithOpenFlags(flags) - if fd.writable { - if err := mnt.CheckBeginWrite(); err != nil { - return nil, err - } - // mnt.EndWrite() is called by regularFileFD.Release(). - } - mnt.IncRef() - vfsd.IncRef() - fd.vfsfd.Init(&fd, mnt, vfsd) - if flags&linux.O_TRUNC != 0 { - impl.mu.Lock() - impl.data = impl.data[:0] - atomic.StoreInt64(&impl.dataLen, 0) - impl.mu.Unlock() - } - return &fd.vfsfd, nil - case *directory: - // Can't open directories writably. - if ats&vfs.MayWrite != 0 { - return nil, syserror.EISDIR - } - var fd directoryFD - mnt.IncRef() - vfsd.IncRef() - fd.vfsfd.Init(&fd, mnt, vfsd) - fd.flags = flags - return &fd.vfsfd, nil - case *symlink: - // Can't open symlinks without O_PATH (which is unimplemented). - return nil, syserror.ELOOP - case *namedPipe: - return newNamedPipeFD(ctx, impl, rp, vfsd, flags) - default: - panic(fmt.Sprintf("unknown inode type: %T", i.impl)) - } -} - -// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. -func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { - fs.mu.RLock() - _, inode, err := walkExistingLocked(rp) - fs.mu.RUnlock() - if err != nil { - return "", err - } - symlink, ok := inode.impl.(*symlink) - if !ok { - return "", syserror.EINVAL - } - return symlink.target, nil -} - -// RenameAt implements vfs.FilesystemImpl.RenameAt. -func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error { - if rp.Done() { - return syserror.ENOENT - } - fs.mu.Lock() - defer fs.mu.Unlock() - parentVFSD, parentInode, err := walkParentDirLocked(rp) - if err != nil { - return err - } - _, err = checkCreateLocked(rp, parentVFSD, parentInode) - if err != nil { - return err - } - if err := rp.Mount().CheckBeginWrite(); err != nil { - return err - } - defer rp.Mount().EndWrite() - // TODO: actually implement RenameAt - return syserror.EPERM -} - -// RmdirAt implements vfs.FilesystemImpl.RmdirAt. -func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { - fs.mu.Lock() - defer fs.mu.Unlock() - vfsd, inode, err := walkExistingLocked(rp) - if err != nil { - return err - } - if err := rp.Mount().CheckBeginWrite(); err != nil { - return err - } - defer rp.Mount().EndWrite() - if err := checkDeleteLocked(vfsd); err != nil { - return err - } - if !inode.isDir() { - return syserror.ENOTDIR - } - if vfsd.HasChildren() { - return syserror.ENOTEMPTY - } - if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil { - return err - } - // Remove from parent directory's childList. - vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry)) - inode.decRef() - return nil -} - -// SetStatAt implements vfs.FilesystemImpl.SetStatAt. -func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { - fs.mu.RLock() - _, _, err := walkExistingLocked(rp) - fs.mu.RUnlock() - if err != nil { - return err - } - if opts.Stat.Mask == 0 { - return nil - } - // TODO: implement inode.setStat - return syserror.EPERM -} - -// StatAt implements vfs.FilesystemImpl.StatAt. -func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { - fs.mu.RLock() - _, inode, err := walkExistingLocked(rp) - fs.mu.RUnlock() - if err != nil { - return linux.Statx{}, err - } - var stat linux.Statx - inode.statTo(&stat) - return stat, nil -} - -// StatFSAt implements vfs.FilesystemImpl.StatFSAt. -func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { - fs.mu.RLock() - _, _, err := walkExistingLocked(rp) - fs.mu.RUnlock() - if err != nil { - return linux.Statfs{}, err - } - // TODO: actually implement statfs - return linux.Statfs{}, syserror.ENOSYS -} - -// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. -func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { - if rp.Done() { - return syserror.EEXIST - } - fs.mu.Lock() - defer fs.mu.Unlock() - parentVFSD, parentInode, err := walkParentDirLocked(rp) - if err != nil { - return err - } - pc, err := checkCreateLocked(rp, parentVFSD, parentInode) - if err != nil { - return err - } - if err := rp.Mount().CheckBeginWrite(); err != nil { - return err - } - defer rp.Mount().EndWrite() - child := fs.newDentry(fs.newSymlink(rp.Credentials(), target)) - parentVFSD.InsertChild(&child.vfsd, pc) - parentInode.impl.(*directory).childList.PushBack(child) - return nil -} - -// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. -func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { - fs.mu.Lock() - defer fs.mu.Unlock() - vfsd, inode, err := walkExistingLocked(rp) - if err != nil { - return err - } - if err := rp.Mount().CheckBeginWrite(); err != nil { - return err - } - defer rp.Mount().EndWrite() - if err := checkDeleteLocked(vfsd); err != nil { - return err - } - if inode.isDir() { - return syserror.EISDIR - } - if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil { - return err - } - // Remove from parent directory's childList. - vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry)) - inode.decLinksLocked() - return nil -} diff --git a/pkg/sentry/fsimpl/memfs/regular_file.go b/pkg/sentry/fsimpl/memfs/regular_file.go deleted file mode 100644 index b7f4853b3..000000000 --- a/pkg/sentry/fsimpl/memfs/regular_file.go +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package memfs - -import ( - "io" - "sync" - "sync/atomic" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" -) - -type regularFile struct { - inode inode - - mu sync.RWMutex - data []byte - // dataLen is len(data), but accessed using atomic memory operations to - // avoid locking in inode.stat(). - dataLen int64 -} - -func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode { - file := ®ularFile{} - file.inode.init(file, fs, creds, mode) - file.inode.nlink = 1 // from parent directory - return &file.inode -} - -type regularFileFD struct { - fileDescription - - // These are immutable. - readable bool - writable bool - - // off is the file offset. off is accessed using atomic memory operations. - // offMu serializes operations that may mutate off. - off int64 - offMu sync.Mutex -} - -// Release implements vfs.FileDescriptionImpl.Release. -func (fd *regularFileFD) Release() { - if fd.writable { - fd.vfsfd.VirtualDentry().Mount().EndWrite() - } -} - -// PRead implements vfs.FileDescriptionImpl.PRead. -func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - if !fd.readable { - return 0, syserror.EINVAL - } - f := fd.inode().impl.(*regularFile) - f.mu.RLock() - if offset >= int64(len(f.data)) { - f.mu.RUnlock() - return 0, io.EOF - } - n, err := dst.CopyOut(ctx, f.data[offset:]) - f.mu.RUnlock() - return int64(n), err -} - -// Read implements vfs.FileDescriptionImpl.Read. -func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - fd.offMu.Lock() - n, err := fd.PRead(ctx, dst, fd.off, opts) - fd.off += n - fd.offMu.Unlock() - return n, err -} - -// PWrite implements vfs.FileDescriptionImpl.PWrite. -func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - if !fd.writable { - return 0, syserror.EINVAL - } - if offset < 0 { - return 0, syserror.EINVAL - } - srclen := src.NumBytes() - if srclen == 0 { - return 0, nil - } - f := fd.inode().impl.(*regularFile) - f.mu.Lock() - end := offset + srclen - if end < offset { - // Overflow. - f.mu.Unlock() - return 0, syserror.EFBIG - } - if end > f.dataLen { - f.data = append(f.data, make([]byte, end-f.dataLen)...) - atomic.StoreInt64(&f.dataLen, end) - } - n, err := src.CopyIn(ctx, f.data[offset:end]) - f.mu.Unlock() - return int64(n), err -} - -// Write implements vfs.FileDescriptionImpl.Write. -func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - fd.offMu.Lock() - n, err := fd.PWrite(ctx, src, fd.off, opts) - fd.off += n - fd.offMu.Unlock() - return n, err -} - -// Seek implements vfs.FileDescriptionImpl.Seek. -func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - fd.offMu.Lock() - defer fd.offMu.Unlock() - switch whence { - case linux.SEEK_SET: - // use offset as specified - case linux.SEEK_CUR: - offset += fd.off - case linux.SEEK_END: - offset += atomic.LoadInt64(&fd.inode().impl.(*regularFile).dataLen) - default: - return 0, syserror.EINVAL - } - if offset < 0 { - return 0, syserror.EINVAL - } - fd.off = offset - return offset, nil -} - -// Sync implements vfs.FileDescriptionImpl.Sync. -func (fd *regularFileFD) Sync(ctx context.Context) error { - return nil -} diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index ade6ac946..1f44b3217 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -6,15 +6,17 @@ package(licenses = ["notice"]) go_library( name = "proc", srcs = [ - "filesystems.go", + "filesystem.go", "loadavg.go", "meminfo.go", "mounts.go", "net.go", - "proc.go", "stat.go", "sys.go", "task.go", + "task_files.go", + "tasks.go", + "tasks_files.go", "version.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc", @@ -24,8 +26,10 @@ go_library( "//pkg/log", "//pkg/sentry/context", "//pkg/sentry/fs", + "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/inet", "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", "//pkg/sentry/limits", "//pkg/sentry/mm", "//pkg/sentry/socket", @@ -34,17 +38,40 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/sentry/vfs", + "//pkg/syserror", ], ) go_test( name = "proc_test", size = "small", - srcs = ["net_test.go"], + srcs = [ + "boot_test.go", + "net_test.go", + "tasks_test.go", + ], embed = [":proc"], deps = [ "//pkg/abi/linux", + "//pkg/cpuid", + "//pkg/fspath", + "//pkg/memutil", + "//pkg/sentry/context", "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", "//pkg/sentry/inet", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/sched", + "//pkg/sentry/limits", + "//pkg/sentry/loader", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + "//pkg/sentry/platform/kvm", + "//pkg/sentry/platform/ptrace", + "//pkg/sentry/time", + "//pkg/sentry/usermem", + "//pkg/sentry/vfs", + "//pkg/syserror", ], ) diff --git a/pkg/sentry/fsimpl/proc/boot_test.go b/pkg/sentry/fsimpl/proc/boot_test.go new file mode 100644 index 000000000..84a93ee56 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/boot_test.go @@ -0,0 +1,149 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "flag" + "fmt" + "os" + "runtime" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/memutil" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/sched" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/loader" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/time" + + // Platforms are plugable. + _ "gvisor.dev/gvisor/pkg/sentry/platform/kvm" + _ "gvisor.dev/gvisor/pkg/sentry/platform/ptrace" +) + +var ( + platformFlag = flag.String("platform", "ptrace", "specify which platform to use") +) + +// boot initializes a new bare bones kernel for test. +func boot() (*kernel.Kernel, error) { + platformCtr, err := platform.Lookup(*platformFlag) + if err != nil { + return nil, fmt.Errorf("platform not found: %v", err) + } + deviceFile, err := platformCtr.OpenDevice() + if err != nil { + return nil, fmt.Errorf("creating platform: %v", err) + } + plat, err := platformCtr.New(deviceFile) + if err != nil { + return nil, fmt.Errorf("creating platform: %v", err) + } + + k := &kernel.Kernel{ + Platform: plat, + } + + mf, err := createMemoryFile() + if err != nil { + return nil, err + } + k.SetMemoryFile(mf) + + // Pass k as the platform since it is savable, unlike the actual platform. + vdso, err := loader.PrepareVDSO(nil, k) + if err != nil { + return nil, fmt.Errorf("creating vdso: %v", err) + } + + // Create timekeeper. + tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange()) + if err != nil { + return nil, fmt.Errorf("creating timekeeper: %v", err) + } + tk.SetClocks(time.NewCalibratedClocks()) + + creds := auth.NewRootCredentials(auth.NewRootUserNamespace()) + + // Initiate the Kernel object, which is required by the Context passed + // to createVFS in order to mount (among other things) procfs. + if err = k.Init(kernel.InitKernelArgs{ + ApplicationCores: uint(runtime.GOMAXPROCS(-1)), + FeatureSet: cpuid.HostFeatureSet(), + Timekeeper: tk, + RootUserNamespace: creds.UserNamespace, + Vdso: vdso, + RootUTSNamespace: kernel.NewUTSNamespace("hostname", "domain", creds.UserNamespace), + RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace), + RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), + PIDNamespace: kernel.NewRootPIDNamespace(creds.UserNamespace), + }); err != nil { + return nil, fmt.Errorf("initializing kernel: %v", err) + } + + ctx := k.SupervisorContext() + + // Create mount namespace without root as it's the minimum required to create + // the global thread group. + mntns, err := fs.NewMountNamespace(ctx, nil) + if err != nil { + return nil, err + } + ls, err := limits.NewLinuxLimitSet() + if err != nil { + return nil, err + } + tg := k.NewThreadGroup(mntns, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls) + k.TestOnly_SetGlobalInit(tg) + + return k, nil +} + +// createTask creates a new bare bones task for tests. +func createTask(ctx context.Context, name string, tc *kernel.ThreadGroup) (*kernel.Task, error) { + k := kernel.KernelFromContext(ctx) + config := &kernel.TaskConfig{ + Kernel: k, + ThreadGroup: tc, + TaskContext: &kernel.TaskContext{Name: name}, + Credentials: auth.CredentialsFromContext(ctx), + AllowedCPUMask: sched.NewFullCPUSet(k.ApplicationCores()), + UTSNamespace: kernel.UTSNamespaceFromContext(ctx), + IPCNamespace: kernel.IPCNamespaceFromContext(ctx), + AbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), + } + return k.TaskSet().NewTask(config) +} + +func createMemoryFile() (*pgalloc.MemoryFile, error) { + const memfileName = "test-memory" + memfd, err := memutil.CreateMemFD(memfileName, 0) + if err != nil { + return nil, fmt.Errorf("error creating memfd: %v", err) + } + memfile := os.NewFile(uintptr(memfd), memfileName) + mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) + if err != nil { + memfile.Close() + return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err) + } + return mf, nil +} diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go new file mode 100644 index 000000000..d09182c77 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -0,0 +1,69 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package proc implements a partial in-memory file system for procfs. +package proc + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// procFSType is the factory class for procfs. +// +// +stateify savable +type procFSType struct{} + +var _ vfs.FilesystemType = (*procFSType)(nil) + +// GetFilesystem implements vfs.FilesystemType. +func (ft *procFSType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + k := kernel.KernelFromContext(ctx) + if k == nil { + return nil, nil, fmt.Errorf("procfs requires a kernel") + } + pidns := kernel.PIDNamespaceFromContext(ctx) + if pidns == nil { + return nil, nil, fmt.Errorf("procfs requires a PID namespace") + } + + procfs := &kernfs.Filesystem{} + procfs.VFSFilesystem().Init(vfsObj, procfs) + + _, dentry := newTasksInode(procfs, k, pidns) + return procfs.VFSFilesystem(), dentry.VFSDentry(), nil +} + +// dynamicInode is an overfitted interface for common Inodes with +// dynamicByteSource types used in procfs. +type dynamicInode interface { + kernfs.Inode + vfs.DynamicBytesSource + + Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) +} + +func newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { + inode.Init(creds, ino, inode, perm) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} diff --git a/pkg/sentry/fsimpl/proc/loadavg.go b/pkg/sentry/fsimpl/proc/loadavg.go index 9135afef1..5351d86e8 100644 --- a/pkg/sentry/fsimpl/proc/loadavg.go +++ b/pkg/sentry/fsimpl/proc/loadavg.go @@ -19,15 +19,17 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" ) // loadavgData backs /proc/loadavg. // // +stateify savable -type loadavgData struct{} +type loadavgData struct { + kernfs.DynamicBytesFile +} -var _ vfs.DynamicBytesSource = (*loadavgData)(nil) +var _ dynamicInode = (*loadavgData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error { diff --git a/pkg/sentry/fsimpl/proc/meminfo.go b/pkg/sentry/fsimpl/proc/meminfo.go index 9a827cd66..cbdd4f3fc 100644 --- a/pkg/sentry/fsimpl/proc/meminfo.go +++ b/pkg/sentry/fsimpl/proc/meminfo.go @@ -19,21 +19,23 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/sentry/vfs" ) // meminfoData implements vfs.DynamicBytesSource for /proc/meminfo. // // +stateify savable type meminfoData struct { + kernfs.DynamicBytesFile + // k is the owning Kernel. k *kernel.Kernel } -var _ vfs.DynamicBytesSource = (*meminfoData)(nil) +var _ dynamicInode = (*meminfoData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { diff --git a/pkg/sentry/fsimpl/proc/mounts.go b/pkg/sentry/fsimpl/proc/mounts.go index e81b1e910..8683cf677 100644 --- a/pkg/sentry/fsimpl/proc/mounts.go +++ b/pkg/sentry/fsimpl/proc/mounts.go @@ -16,7 +16,7 @@ package proc import "gvisor.dev/gvisor/pkg/sentry/kernel" -// TODO(b/138862512): Implement mountInfoFile and mountsFile. +// TODO(gvisor.dev/issue/1195): Implement mountInfoFile and mountsFile. // mountInfoFile implements vfs.DynamicBytesSource for /proc/[pid]/mountinfo. // diff --git a/pkg/sentry/fsimpl/proc/stat.go b/pkg/sentry/fsimpl/proc/stat.go index 720db3828..50894a534 100644 --- a/pkg/sentry/fsimpl/proc/stat.go +++ b/pkg/sentry/fsimpl/proc/stat.go @@ -20,8 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/vfs" ) // cpuStats contains the breakdown of CPU time for /proc/stat. @@ -66,11 +66,13 @@ func (c cpuStats) String() string { // // +stateify savable type statData struct { + kernfs.DynamicBytesFile + // k is the owning Kernel. k *kernel.Kernel } -var _ vfs.DynamicBytesSource = (*statData)(nil) +var _ dynamicInode = (*statData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error { diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index c46e05c3a..11a64c777 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -15,247 +15,176 @@ package proc import ( - "bytes" - "fmt" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" ) -// mapsCommon is embedded by mapsData and smapsData. -type mapsCommon struct { - t *kernel.Task -} - -// mm gets the kernel task's MemoryManager. No additional reference is taken on -// mm here. This is safe because MemoryManager.destroy is required to leave the -// MemoryManager in a state where it's still usable as a DynamicBytesSource. -func (md *mapsCommon) mm() *mm.MemoryManager { - var tmm *mm.MemoryManager - md.t.WithMuLocked(func(t *kernel.Task) { - if mm := t.MemoryManager(); mm != nil { - tmm = mm - } - }) - return tmm -} - -// mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps. +// taskInode represents the inode for /proc/PID/ directory. // // +stateify savable -type mapsData struct { - mapsCommon +type taskInode struct { + kernfs.InodeNotSymlink + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNoDynamicLookup + kernfs.InodeAttrs + kernfs.OrderedChildren + + task *kernel.Task } -var _ vfs.DynamicBytesSource = (*mapsData)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (md *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { - if mm := md.mm(); mm != nil { - mm.ReadMapsDataInto(ctx, buf) +var _ kernfs.Inode = (*taskInode)(nil) + +func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool) *kernfs.Dentry { + contents := map[string]*kernfs.Dentry{ + //"auxv": newAuxvec(t, msrc), + //"cmdline": newExecArgInode(t, msrc, cmdlineExecArg), + //"comm": newComm(t, msrc), + //"environ": newExecArgInode(t, msrc, environExecArg), + //"exe": newExe(t, msrc), + //"fd": newFdDir(t, msrc), + //"fdinfo": newFdInfoDir(t, msrc), + //"gid_map": newGIDMap(t, msrc), + "io": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, newIO(task, isThreadGroup)), + "maps": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &mapsData{task: task}), + //"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), + //"mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), + //"ns": newNamespaceDir(t, msrc), + "smaps": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &smapsData{task: task}), + "stat": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &taskStatData{t: task, pidns: pidns, tgstats: isThreadGroup}), + "statm": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &statmData{t: task}), + "status": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &statusData{t: task, pidns: pidns}), + //"uid_map": newUIDMap(t, msrc), } - return nil -} - -// smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps. -// -// +stateify savable -type smapsData struct { - mapsCommon -} - -var _ vfs.DynamicBytesSource = (*smapsData)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (sd *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { - if mm := sd.mm(); mm != nil { - mm.ReadSmapsDataInto(ctx, buf) + if isThreadGroup { + //contents["task"] = p.newSubtasks(t, msrc) } - return nil -} - -// +stateify savable -type taskStatData struct { - t *kernel.Task + //if len(p.cgroupControllers) > 0 { + // contents["cgroup"] = newCGroupInode(t, msrc, p.cgroupControllers) + //} - // If tgstats is true, accumulate fault stats (not implemented) and CPU - // time across all tasks in t's thread group. - tgstats bool + taskInode := &taskInode{task: task} + // Note: credentials are overridden by taskOwnedInode. + taskInode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) - // pidns is the PID namespace associated with the proc filesystem that - // includes the file using this statData. - pidns *kernel.PIDNamespace -} - -var _ vfs.DynamicBytesSource = (*taskStatData)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { - fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.t)) - fmt.Fprintf(buf, "(%s) ", s.t.Name()) - fmt.Fprintf(buf, "%c ", s.t.StateStatus()[0]) - ppid := kernel.ThreadID(0) - if parent := s.t.Parent(); parent != nil { - ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) - } - fmt.Fprintf(buf, "%d ", ppid) - fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.t.ThreadGroup().ProcessGroup())) - fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.t.ThreadGroup().Session())) - fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */) - fmt.Fprintf(buf, "0 " /* flags */) - fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */) - var cputime usage.CPUStats - if s.tgstats { - cputime = s.t.ThreadGroup().CPUStats() - } else { - cputime = s.t.CPUStats() - } - fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) - cputime = s.t.ThreadGroup().JoinedChildCPUStats() - fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) - fmt.Fprintf(buf, "%d %d ", s.t.Priority(), s.t.Niceness()) - fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Count()) + inode := &taskOwnedInode{Inode: taskInode, owner: task} + dentry := &kernfs.Dentry{} + dentry.Init(inode) - // itrealvalue. Since kernel 2.6.17, this field is no longer - // maintained, and is hard coded as 0. - fmt.Fprintf(buf, "0 ") + taskInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + links := taskInode.OrderedChildren.Populate(dentry, contents) + taskInode.IncLinks(links) - // Start time is relative to boot time, expressed in clock ticks. - fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.t.StartTime().Sub(s.t.Kernel().Timekeeper().BootTime()))) + return dentry +} - var vss, rss uint64 - s.t.WithMuLocked(func(t *kernel.Task) { - if mm := t.MemoryManager(); mm != nil { - vss = mm.VirtualMemorySize() - rss = mm.ResidentSetSize() - } - }) - fmt.Fprintf(buf, "%d %d ", vss, rss/usermem.PageSize) +// Valid implements kernfs.inodeDynamicLookup. This inode remains valid as long +// as the task is still running. When it's dead, another tasks with the same +// PID could replace it. +func (i *taskInode) Valid(ctx context.Context) bool { + return i.task.ExitState() != kernel.TaskExitDead +} - // rsslim. - fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Limits().Get(limits.Rss).Cur) +// Open implements kernfs.Inode. +func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + fd := &kernfs.GenericDirectoryFD{} + fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags) + return fd.VFSFileDescription(), nil +} - fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */) - fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */) - fmt.Fprintf(buf, "0 0 " /* nswap cnswap */) - terminationSignal := linux.Signal(0) - if s.t == s.t.ThreadGroup().Leader() { - terminationSignal = s.t.ThreadGroup().TerminationSignal() +// SetStat implements kernfs.Inode. +func (i *taskInode) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error { + stat := opts.Stat + if stat.Mask&linux.STATX_MODE != 0 { + return syserror.EPERM } - fmt.Fprintf(buf, "%d ", terminationSignal) - fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */) - fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */) - fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */) - fmt.Fprintf(buf, "0\n" /* exit_code */) - return nil } -// statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm. -// -// +stateify savable -type statmData struct { - t *kernel.Task +// taskOwnedInode implements kernfs.Inode and overrides inode owner with task +// effective user and group. +type taskOwnedInode struct { + kernfs.Inode + + // owner is the task that owns this inode. + owner *kernel.Task } -var _ vfs.DynamicBytesSource = (*statmData)(nil) +var _ kernfs.Inode = (*taskOwnedInode)(nil) -// Generate implements vfs.DynamicBytesSource.Generate. -func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error { - var vss, rss uint64 - s.t.WithMuLocked(func(t *kernel.Task) { - if mm := t.MemoryManager(); mm != nil { - vss = mm.VirtualMemorySize() - rss = mm.ResidentSetSize() - } - }) +func newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { + // Note: credentials are overridden by taskOwnedInode. + inode.Init(task.Credentials(), ino, inode, perm) - fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/usermem.PageSize, rss/usermem.PageSize) - return nil + taskInode := &taskOwnedInode{Inode: inode, owner: task} + d := &kernfs.Dentry{} + d.Init(taskInode) + return d } -// statusData implements vfs.DynamicBytesSource for /proc/[pid]/status. -// -// +stateify savable -type statusData struct { - t *kernel.Task - pidns *kernel.PIDNamespace +// Stat implements kernfs.Inode. +func (i *taskOwnedInode) Stat(fs *vfs.Filesystem) linux.Statx { + stat := i.Inode.Stat(fs) + uid, gid := i.getOwner(linux.FileMode(stat.Mode)) + stat.UID = uint32(uid) + stat.GID = uint32(gid) + return stat } -var _ vfs.DynamicBytesSource = (*statusData)(nil) +// CheckPermissions implements kernfs.Inode. +func (i *taskOwnedInode) CheckPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + mode := i.Mode() + uid, gid := i.getOwner(mode) + return vfs.GenericCheckPermissions( + creds, + ats, + mode.FileType() == linux.ModeDirectory, + uint16(mode), + uid, + gid, + ) +} -// Generate implements vfs.DynamicBytesSource.Generate. -func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error { - fmt.Fprintf(buf, "Name:\t%s\n", s.t.Name()) - fmt.Fprintf(buf, "State:\t%s\n", s.t.StateStatus()) - fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.t.ThreadGroup())) - fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.t)) - ppid := kernel.ThreadID(0) - if parent := s.t.Parent(); parent != nil { - ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) +func (i *taskOwnedInode) getOwner(mode linux.FileMode) (auth.KUID, auth.KGID) { + // By default, set the task owner as the file owner. + creds := i.owner.Credentials() + uid := creds.EffectiveKUID + gid := creds.EffectiveKGID + + // Linux doesn't apply dumpability adjustments to world readable/executable + // directories so that applications can stat /proc/PID to determine the + // effective UID of a process. See fs/proc/base.c:task_dump_owner. + if mode.FileType() == linux.ModeDirectory && mode.Permissions() == 0555 { + return uid, gid } - fmt.Fprintf(buf, "PPid:\t%d\n", ppid) - tpid := kernel.ThreadID(0) - if tracer := s.t.Tracer(); tracer != nil { - tpid = s.pidns.IDOfTask(tracer) + + // If the task is not dumpable, then root (in the namespace preferred) + // owns the file. + m := getMM(i.owner) + if m == nil { + return auth.RootKUID, auth.RootKGID } - fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid) - var fds int - var vss, rss, data uint64 - s.t.WithMuLocked(func(t *kernel.Task) { - if fdTable := t.FDTable(); fdTable != nil { - fds = fdTable.Size() + if m.Dumpability() != mm.UserDumpable { + uid = auth.RootKUID + if kuid := creds.UserNamespace.MapToKUID(auth.RootUID); kuid.Ok() { + uid = kuid } - if mm := t.MemoryManager(); mm != nil { - vss = mm.VirtualMemorySize() - rss = mm.ResidentSetSize() - data = mm.VirtualDataSize() + gid = auth.RootKGID + if kgid := creds.UserNamespace.MapToKGID(auth.RootGID); kgid.Ok() { + gid = kgid } - }) - fmt.Fprintf(buf, "FDSize:\t%d\n", fds) - fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10) - fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10) - fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10) - fmt.Fprintf(buf, "Threads:\t%d\n", s.t.ThreadGroup().Count()) - creds := s.t.Credentials() - fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps) - fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps) - fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps) - fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps) - fmt.Fprintf(buf, "Seccomp:\t%d\n", s.t.SeccompMode()) - return nil -} - -// ioUsage is the /proc/<pid>/io and /proc/<pid>/task/<tid>/io data provider. -type ioUsage interface { - // IOUsage returns the io usage data. - IOUsage() *usage.IO -} - -// +stateify savable -type ioData struct { - ioUsage + } + return uid, gid } -var _ vfs.DynamicBytesSource = (*ioData)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error { - io := usage.IO{} - io.Accumulate(i.IOUsage()) - - fmt.Fprintf(buf, "char: %d\n", io.CharsRead) - fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten) - fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls) - fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls) - fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead) - fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten) - fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled) - return nil +func newIO(t *kernel.Task, isThreadGroup bool) *ioData { + if isThreadGroup { + return &ioData{ioUsage: t.ThreadGroup()} + } + return &ioData{ioUsage: t} } diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go new file mode 100644 index 000000000..93f0e1aa8 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -0,0 +1,272 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/usermem" +) + +// mm gets the kernel task's MemoryManager. No additional reference is taken on +// mm here. This is safe because MemoryManager.destroy is required to leave the +// MemoryManager in a state where it's still usable as a DynamicBytesSource. +func getMM(task *kernel.Task) *mm.MemoryManager { + var tmm *mm.MemoryManager + task.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + tmm = mm + } + }) + return tmm +} + +// mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps. +// +// +stateify savable +type mapsData struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ dynamicInode = (*mapsData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { + if mm := getMM(d.task); mm != nil { + mm.ReadMapsDataInto(ctx, buf) + } + return nil +} + +// smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps. +// +// +stateify savable +type smapsData struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ dynamicInode = (*smapsData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { + if mm := getMM(d.task); mm != nil { + mm.ReadSmapsDataInto(ctx, buf) + } + return nil +} + +// +stateify savable +type taskStatData struct { + kernfs.DynamicBytesFile + + t *kernel.Task + + // If tgstats is true, accumulate fault stats (not implemented) and CPU + // time across all tasks in t's thread group. + tgstats bool + + // pidns is the PID namespace associated with the proc filesystem that + // includes the file using this statData. + pidns *kernel.PIDNamespace +} + +var _ dynamicInode = (*taskStatData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.t)) + fmt.Fprintf(buf, "(%s) ", s.t.Name()) + fmt.Fprintf(buf, "%c ", s.t.StateStatus()[0]) + ppid := kernel.ThreadID(0) + if parent := s.t.Parent(); parent != nil { + ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) + } + fmt.Fprintf(buf, "%d ", ppid) + fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.t.ThreadGroup().ProcessGroup())) + fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.t.ThreadGroup().Session())) + fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */) + fmt.Fprintf(buf, "0 " /* flags */) + fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */) + var cputime usage.CPUStats + if s.tgstats { + cputime = s.t.ThreadGroup().CPUStats() + } else { + cputime = s.t.CPUStats() + } + fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) + cputime = s.t.ThreadGroup().JoinedChildCPUStats() + fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) + fmt.Fprintf(buf, "%d %d ", s.t.Priority(), s.t.Niceness()) + fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Count()) + + // itrealvalue. Since kernel 2.6.17, this field is no longer + // maintained, and is hard coded as 0. + fmt.Fprintf(buf, "0 ") + + // Start time is relative to boot time, expressed in clock ticks. + fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.t.StartTime().Sub(s.t.Kernel().Timekeeper().BootTime()))) + + var vss, rss uint64 + s.t.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + } + }) + fmt.Fprintf(buf, "%d %d ", vss, rss/usermem.PageSize) + + // rsslim. + fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Limits().Get(limits.Rss).Cur) + + fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */) + fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */) + fmt.Fprintf(buf, "0 0 " /* nswap cnswap */) + terminationSignal := linux.Signal(0) + if s.t == s.t.ThreadGroup().Leader() { + terminationSignal = s.t.ThreadGroup().TerminationSignal() + } + fmt.Fprintf(buf, "%d ", terminationSignal) + fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */) + fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */) + fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */) + fmt.Fprintf(buf, "0\n" /* exit_code */) + + return nil +} + +// statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm. +// +// +stateify savable +type statmData struct { + kernfs.DynamicBytesFile + + t *kernel.Task +} + +var _ dynamicInode = (*statmData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error { + var vss, rss uint64 + s.t.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + } + }) + + fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/usermem.PageSize, rss/usermem.PageSize) + return nil +} + +// statusData implements vfs.DynamicBytesSource for /proc/[pid]/status. +// +// +stateify savable +type statusData struct { + kernfs.DynamicBytesFile + + t *kernel.Task + pidns *kernel.PIDNamespace +} + +var _ dynamicInode = (*statusData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "Name:\t%s\n", s.t.Name()) + fmt.Fprintf(buf, "State:\t%s\n", s.t.StateStatus()) + fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.t.ThreadGroup())) + fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.t)) + ppid := kernel.ThreadID(0) + if parent := s.t.Parent(); parent != nil { + ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) + } + fmt.Fprintf(buf, "PPid:\t%d\n", ppid) + tpid := kernel.ThreadID(0) + if tracer := s.t.Tracer(); tracer != nil { + tpid = s.pidns.IDOfTask(tracer) + } + fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid) + var fds int + var vss, rss, data uint64 + s.t.WithMuLocked(func(t *kernel.Task) { + if fdTable := t.FDTable(); fdTable != nil { + fds = fdTable.Size() + } + if mm := t.MemoryManager(); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + data = mm.VirtualDataSize() + } + }) + fmt.Fprintf(buf, "FDSize:\t%d\n", fds) + fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10) + fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10) + fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10) + fmt.Fprintf(buf, "Threads:\t%d\n", s.t.ThreadGroup().Count()) + creds := s.t.Credentials() + fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps) + fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps) + fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps) + fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps) + fmt.Fprintf(buf, "Seccomp:\t%d\n", s.t.SeccompMode()) + // We unconditionally report a single NUMA node. See + // pkg/sentry/syscalls/linux/sys_mempolicy.go. + fmt.Fprintf(buf, "Mems_allowed:\t1\n") + fmt.Fprintf(buf, "Mems_allowed_list:\t0\n") + return nil +} + +// ioUsage is the /proc/<pid>/io and /proc/<pid>/task/<tid>/io data provider. +type ioUsage interface { + // IOUsage returns the io usage data. + IOUsage() *usage.IO +} + +// +stateify savable +type ioData struct { + kernfs.DynamicBytesFile + + ioUsage +} + +var _ dynamicInode = (*ioData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error { + io := usage.IO{} + io.Accumulate(i.IOUsage()) + + fmt.Fprintf(buf, "char: %d\n", io.CharsRead) + fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten) + fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls) + fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls) + fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead) + fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten) + fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled) + return nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go new file mode 100644 index 000000000..d8f92d52f --- /dev/null +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -0,0 +1,218 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "sort" + "strconv" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +const ( + defaultPermission = 0444 + selfName = "self" + threadSelfName = "thread-self" +) + +// InoGenerator generates unique inode numbers for a given filesystem. +type InoGenerator interface { + NextIno() uint64 +} + +// tasksInode represents the inode for /proc/ directory. +// +// +stateify savable +type tasksInode struct { + kernfs.InodeNotSymlink + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeAttrs + kernfs.OrderedChildren + + inoGen InoGenerator + pidns *kernel.PIDNamespace + + // '/proc/self' and '/proc/thread-self' have custom directory offsets in + // Linux. So handle them outside of OrderedChildren. + selfSymlink *vfs.Dentry + threadSelfSymlink *vfs.Dentry +} + +var _ kernfs.Inode = (*tasksInode)(nil) + +func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace) (*tasksInode, *kernfs.Dentry) { + root := auth.NewRootCredentials(pidns.UserNamespace()) + contents := map[string]*kernfs.Dentry{ + //"cpuinfo": newCPUInfo(ctx, msrc), + //"filesystems": seqfile.NewSeqFileInode(ctx, &filesystemsData{}, msrc), + "loadavg": newDentry(root, inoGen.NextIno(), defaultPermission, &loadavgData{}), + "meminfo": newDentry(root, inoGen.NextIno(), defaultPermission, &meminfoData{k: k}), + "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), defaultPermission, "self/mounts"), + "stat": newDentry(root, inoGen.NextIno(), defaultPermission, &statData{k: k}), + //"uptime": newUptime(ctx, msrc), + //"version": newVersionData(root, inoGen.NextIno(), k), + "version": newDentry(root, inoGen.NextIno(), defaultPermission, &versionData{k: k}), + } + + inode := &tasksInode{ + pidns: pidns, + inoGen: inoGen, + selfSymlink: newSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(), + threadSelfSymlink: newThreadSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(), + } + inode.InodeAttrs.Init(root, inoGen.NextIno(), linux.ModeDirectory|0555) + + dentry := &kernfs.Dentry{} + dentry.Init(inode) + + inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + links := inode.OrderedChildren.Populate(dentry, contents) + inode.IncLinks(links) + + return inode, dentry +} + +// Lookup implements kernfs.inodeDynamicLookup. +func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + // Try to lookup a corresponding task. + tid, err := strconv.ParseUint(name, 10, 64) + if err != nil { + // If it failed to parse, check if it's one of the special handled files. + switch name { + case selfName: + return i.selfSymlink, nil + case threadSelfName: + return i.threadSelfSymlink, nil + } + return nil, syserror.ENOENT + } + + task := i.pidns.TaskWithID(kernel.ThreadID(tid)) + if task == nil { + return nil, syserror.ENOENT + } + + taskDentry := newTaskInode(i.inoGen, task, i.pidns, true) + return taskDentry.VFSDentry(), nil +} + +// Valid implements kernfs.inodeDynamicLookup. +func (i *tasksInode) Valid(ctx context.Context) bool { + return true +} + +// IterDirents implements kernfs.inodeDynamicLookup. +func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) { + // fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256 + const FIRST_PROCESS_ENTRY = 256 + + // Use maxTaskID to shortcut searches that will result in 0 entries. + const maxTaskID = kernel.TasksLimit + 1 + if offset >= maxTaskID { + return offset, nil + } + + // According to Linux (fs/proc/base.c:proc_pid_readdir()), process directories + // start at offset FIRST_PROCESS_ENTRY with '/proc/self', followed by + // '/proc/thread-self' and then '/proc/[pid]'. + if offset < FIRST_PROCESS_ENTRY { + offset = FIRST_PROCESS_ENTRY + } + + if offset == FIRST_PROCESS_ENTRY { + dirent := vfs.Dirent{ + Name: selfName, + Type: linux.DT_LNK, + Ino: i.inoGen.NextIno(), + NextOff: offset + 1, + } + if !cb.Handle(dirent) { + return offset, nil + } + offset++ + } + if offset == FIRST_PROCESS_ENTRY+1 { + dirent := vfs.Dirent{ + Name: threadSelfName, + Type: linux.DT_LNK, + Ino: i.inoGen.NextIno(), + NextOff: offset + 1, + } + if !cb.Handle(dirent) { + return offset, nil + } + offset++ + } + + // Collect all tasks that TGIDs are greater than the offset specified. Per + // Linux we only include in directory listings if it's the leader. But for + // whatever crazy reason, you can still walk to the given node. + var tids []int + startTid := offset - FIRST_PROCESS_ENTRY - 2 + for _, tg := range i.pidns.ThreadGroups() { + tid := i.pidns.IDOfThreadGroup(tg) + if int64(tid) < startTid { + continue + } + if leader := tg.Leader(); leader != nil { + tids = append(tids, int(tid)) + } + } + + if len(tids) == 0 { + return offset, nil + } + + sort.Ints(tids) + for _, tid := range tids { + dirent := vfs.Dirent{ + Name: strconv.FormatUint(uint64(tid), 10), + Type: linux.DT_DIR, + Ino: i.inoGen.NextIno(), + NextOff: FIRST_PROCESS_ENTRY + 2 + int64(tid) + 1, + } + if !cb.Handle(dirent) { + return offset, nil + } + offset++ + } + return maxTaskID, nil +} + +// Open implements kernfs.Inode. +func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + fd := &kernfs.GenericDirectoryFD{} + fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags) + return fd.VFSFileDescription(), nil +} + +func (i *tasksInode) Stat(vsfs *vfs.Filesystem) linux.Statx { + stat := i.InodeAttrs.Stat(vsfs) + + // Add dynamic children to link count. + for _, tg := range i.pidns.ThreadGroups() { + if leader := tg.Leader(); leader != nil { + stat.Nlink++ + } + } + + return stat +} diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go new file mode 100644 index 000000000..91f30a798 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -0,0 +1,92 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "fmt" + "strconv" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +type selfSymlink struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeSymlink + + pidns *kernel.PIDNamespace +} + +var _ kernfs.Inode = (*selfSymlink)(nil) + +func newSelfSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, pidns *kernel.PIDNamespace) *kernfs.Dentry { + inode := &selfSymlink{pidns: pidns} + inode.Init(creds, ino, linux.ModeSymlink|perm) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} + +func (s *selfSymlink) Readlink(ctx context.Context) (string, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + // Who is reading this link? + return "", syserror.EINVAL + } + tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) + if tgid == 0 { + return "", syserror.ENOENT + } + return strconv.FormatUint(uint64(tgid), 10), nil +} + +type threadSelfSymlink struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeSymlink + + pidns *kernel.PIDNamespace +} + +var _ kernfs.Inode = (*threadSelfSymlink)(nil) + +func newThreadSelfSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, pidns *kernel.PIDNamespace) *kernfs.Dentry { + inode := &threadSelfSymlink{pidns: pidns} + inode.Init(creds, ino, linux.ModeSymlink|perm) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} + +func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + // Who is reading this link? + return "", syserror.EINVAL + } + tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) + tid := s.pidns.IDOfTask(t) + if tid == 0 || tgid == 0 { + return "", syserror.ENOENT + } + return fmt.Sprintf("%d/task/%d", tgid, tid), nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go new file mode 100644 index 000000000..ca8c87ec2 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -0,0 +1,555 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "fmt" + "math" + "path" + "strconv" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +var ( + // Next offset 256 by convention. Adds 1 for the next offset. + selfLink = vfs.Dirent{Type: linux.DT_LNK, NextOff: 256 + 0 + 1} + threadSelfLink = vfs.Dirent{Type: linux.DT_LNK, NextOff: 256 + 1 + 1} + + // /proc/[pid] next offset starts at 256+2 (files above), then adds the + // PID, and adds 1 for the next offset. + proc1 = vfs.Dirent{Type: linux.DT_DIR, NextOff: 258 + 1 + 1} + proc2 = vfs.Dirent{Type: linux.DT_DIR, NextOff: 258 + 2 + 1} + proc3 = vfs.Dirent{Type: linux.DT_DIR, NextOff: 258 + 3 + 1} +) + +type testIterDirentsCallback struct { + dirents []vfs.Dirent +} + +func (t *testIterDirentsCallback) Handle(d vfs.Dirent) bool { + t.dirents = append(t.dirents, d) + return true +} + +func checkDots(dirs []vfs.Dirent) ([]vfs.Dirent, error) { + if got := len(dirs); got < 2 { + return dirs, fmt.Errorf("wrong number of dirents, want at least: 2, got: %d: %v", got, dirs) + } + for i, want := range []string{".", ".."} { + if got := dirs[i].Name; got != want { + return dirs, fmt.Errorf("wrong name, want: %s, got: %s", want, got) + } + if got := dirs[i].Type; got != linux.DT_DIR { + return dirs, fmt.Errorf("wrong type, want: %d, got: %d", linux.DT_DIR, got) + } + } + return dirs[2:], nil +} + +func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) { + wants := map[string]vfs.Dirent{ + "loadavg": {Type: linux.DT_REG}, + "meminfo": {Type: linux.DT_REG}, + "mounts": {Type: linux.DT_LNK}, + "self": selfLink, + "stat": {Type: linux.DT_REG}, + "thread-self": threadSelfLink, + "version": {Type: linux.DT_REG}, + } + return checkFiles(gots, wants) +} + +func checkTaskStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) { + wants := map[string]vfs.Dirent{ + "io": {Type: linux.DT_REG}, + "maps": {Type: linux.DT_REG}, + "smaps": {Type: linux.DT_REG}, + "stat": {Type: linux.DT_REG}, + "statm": {Type: linux.DT_REG}, + "status": {Type: linux.DT_REG}, + } + return checkFiles(gots, wants) +} + +func checkFiles(gots []vfs.Dirent, wants map[string]vfs.Dirent) ([]vfs.Dirent, error) { + // Go over all files, when there is a match, the file is removed from both + // 'gots' and 'wants'. wants is expected to reach 0, as all files must + // be present. Remaining files in 'gots', is returned to caller to decide + // whether this is valid or not. + for i := 0; i < len(gots); i++ { + got := gots[i] + want, ok := wants[got.Name] + if !ok { + continue + } + if want.Type != got.Type { + return gots, fmt.Errorf("wrong file type, want: %v, got: %v: %+v", want.Type, got.Type, got) + } + if want.NextOff != 0 && want.NextOff != got.NextOff { + return gots, fmt.Errorf("wrong dirent offset, want: %v, got: %v: %+v", want.NextOff, got.NextOff, got) + } + + delete(wants, got.Name) + gots = append(gots[0:i], gots[i+1:]...) + i-- + } + if len(wants) != 0 { + return gots, fmt.Errorf("not all files were found, missing: %+v", wants) + } + return gots, nil +} + +func setup() (context.Context, *vfs.VirtualFilesystem, vfs.VirtualDentry, error) { + k, err := boot() + if err != nil { + return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("creating kernel: %v", err) + } + + ctx := k.SupervisorContext() + creds := auth.CredentialsFromContext(ctx) + + vfsObj := vfs.New() + vfsObj.MustRegisterFilesystemType("procfs", &procFSType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &vfs.GetFilesystemOptions{}) + if err != nil { + return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("NewMountNamespace(): %v", err) + } + return ctx, vfsObj, mntns.Root(), nil +} + +func TestTasksEmpty(t *testing.T) { + ctx, vfsObj, root, err := setup() + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + defer root.DecRef() + + fd, err := vfsObj.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt failed: %v", err) + } + + cb := testIterDirentsCallback{} + if err := fd.Impl().IterDirents(ctx, &cb); err != nil { + t.Fatalf("IterDirents(): %v", err) + } + cb.dirents, err = checkDots(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + cb.dirents, err = checkTasksStaticFiles(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + if len(cb.dirents) != 0 { + t.Errorf("found more files than expected: %+v", cb.dirents) + } +} + +func TestTasks(t *testing.T) { + ctx, vfsObj, root, err := setup() + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + defer root.DecRef() + + k := kernel.KernelFromContext(ctx) + var tasks []*kernel.Task + for i := 0; i < 5; i++ { + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + task, err := createTask(ctx, fmt.Sprintf("name-%d", i), tc) + if err != nil { + t.Fatalf("CreateTask(): %v", err) + } + tasks = append(tasks, task) + } + + fd, err := vfsObj.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt(/) failed: %v", err) + } + + cb := testIterDirentsCallback{} + if err := fd.Impl().IterDirents(ctx, &cb); err != nil { + t.Fatalf("IterDirents(): %v", err) + } + cb.dirents, err = checkDots(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + cb.dirents, err = checkTasksStaticFiles(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + lastPid := 0 + for _, d := range cb.dirents { + pid, err := strconv.Atoi(d.Name) + if err != nil { + t.Fatalf("Invalid process directory %q", d.Name) + } + if lastPid > pid { + t.Errorf("pids not in order: %v", cb.dirents) + } + found := false + for _, t := range tasks { + if k.TaskSet().Root.IDOfTask(t) == kernel.ThreadID(pid) { + found = true + } + } + if !found { + t.Errorf("Additional task ID %d listed: %v", pid, tasks) + } + // Next offset starts at 256+2 ('self' and 'thread-self'), then adds the + // PID, and adds 1 for the next offset. + if want := int64(256 + 2 + pid + 1); d.NextOff != want { + t.Errorf("Wrong dirent offset want: %d got: %d: %+v", want, d.NextOff, d) + } + } + + // Test lookup. + for _, path := range []string{"/1", "/2"} { + fd, err := vfsObj.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(path)}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt(%q) failed: %v", path, err) + } + buf := make([]byte, 1) + bufIOSeq := usermem.BytesIOSequence(buf) + if _, err := fd.Read(ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR { + t.Errorf("wrong error reading directory: %v", err) + } + } + + if _, err := vfsObj.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/9999")}, + &vfs.OpenOptions{}, + ); err != syserror.ENOENT { + t.Fatalf("wrong error from vfsfs.OpenAt(/9999): %v", err) + } +} + +func TestTasksOffset(t *testing.T) { + ctx, vfsObj, root, err := setup() + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + defer root.DecRef() + + k := kernel.KernelFromContext(ctx) + for i := 0; i < 3; i++ { + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + if _, err := createTask(ctx, fmt.Sprintf("name-%d", i), tc); err != nil { + t.Fatalf("CreateTask(): %v", err) + } + } + + for _, tc := range []struct { + name string + offset int64 + wants map[string]vfs.Dirent + }{ + { + name: "small offset", + offset: 100, + wants: map[string]vfs.Dirent{ + "self": selfLink, + "thread-self": threadSelfLink, + "1": proc1, + "2": proc2, + "3": proc3, + }, + }, + { + name: "offset at start", + offset: 256, + wants: map[string]vfs.Dirent{ + "self": selfLink, + "thread-self": threadSelfLink, + "1": proc1, + "2": proc2, + "3": proc3, + }, + }, + { + name: "skip /proc/self", + offset: 257, + wants: map[string]vfs.Dirent{ + "thread-self": threadSelfLink, + "1": proc1, + "2": proc2, + "3": proc3, + }, + }, + { + name: "skip symlinks", + offset: 258, + wants: map[string]vfs.Dirent{ + "1": proc1, + "2": proc2, + "3": proc3, + }, + }, + { + name: "skip first process", + offset: 260, + wants: map[string]vfs.Dirent{ + "2": proc2, + "3": proc3, + }, + }, + { + name: "last process", + offset: 261, + wants: map[string]vfs.Dirent{ + "3": proc3, + }, + }, + { + name: "after last", + offset: 262, + wants: nil, + }, + { + name: "TaskLimit+1", + offset: kernel.TasksLimit + 1, + wants: nil, + }, + { + name: "max", + offset: math.MaxInt64, + wants: nil, + }, + } { + t.Run(tc.name, func(t *testing.T) { + fd, err := vfsObj.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt(/) failed: %v", err) + } + if _, err := fd.Impl().Seek(ctx, tc.offset, linux.SEEK_SET); err != nil { + t.Fatalf("Seek(%d, SEEK_SET): %v", tc.offset, err) + } + + cb := testIterDirentsCallback{} + if err := fd.Impl().IterDirents(ctx, &cb); err != nil { + t.Fatalf("IterDirents(): %v", err) + } + if cb.dirents, err = checkFiles(cb.dirents, tc.wants); err != nil { + t.Error(err.Error()) + } + if len(cb.dirents) != 0 { + t.Errorf("found more files than expected: %+v", cb.dirents) + } + }) + } +} + +func TestTask(t *testing.T) { + ctx, vfsObj, root, err := setup() + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + defer root.DecRef() + + k := kernel.KernelFromContext(ctx) + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + _, err = createTask(ctx, "name", tc) + if err != nil { + t.Fatalf("CreateTask(): %v", err) + } + + fd, err := vfsObj.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/1")}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt(/1) failed: %v", err) + } + + cb := testIterDirentsCallback{} + if err := fd.Impl().IterDirents(ctx, &cb); err != nil { + t.Fatalf("IterDirents(): %v", err) + } + cb.dirents, err = checkDots(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + cb.dirents, err = checkTaskStaticFiles(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + if len(cb.dirents) != 0 { + t.Errorf("found more files than expected: %+v", cb.dirents) + } +} + +func TestProcSelf(t *testing.T) { + ctx, vfsObj, root, err := setup() + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + defer root.DecRef() + + k := kernel.KernelFromContext(ctx) + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + task, err := createTask(ctx, "name", tc) + if err != nil { + t.Fatalf("CreateTask(): %v", err) + } + + fd, err := vfsObj.OpenAt( + task, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/self/"), FollowFinalSymlink: true}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt(/self/) failed: %v", err) + } + + cb := testIterDirentsCallback{} + if err := fd.Impl().IterDirents(ctx, &cb); err != nil { + t.Fatalf("IterDirents(): %v", err) + } + cb.dirents, err = checkDots(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + cb.dirents, err = checkTaskStaticFiles(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + if len(cb.dirents) != 0 { + t.Errorf("found more files than expected: %+v", cb.dirents) + } +} + +func iterateDir(ctx context.Context, t *testing.T, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, fd *vfs.FileDescription) { + t.Logf("Iterating: /proc%s", fd.MappedName(ctx)) + + cb := testIterDirentsCallback{} + if err := fd.Impl().IterDirents(ctx, &cb); err != nil { + t.Fatalf("IterDirents(): %v", err) + } + var err error + cb.dirents, err = checkDots(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + for _, d := range cb.dirents { + childPath := path.Join(fd.MappedName(ctx), d.Name) + if d.Type == linux.DT_LNK { + link, err := vfsObj.ReadlinkAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(childPath)}, + ) + if err != nil { + t.Errorf("vfsfs.ReadlinkAt(%v) failed: %v", childPath, err) + } else { + t.Logf("Skipping symlink: /proc%s => %s", childPath, link) + } + continue + } + + t.Logf("Opening: /proc%s", childPath) + child, err := vfsObj.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(childPath)}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Errorf("vfsfs.OpenAt(%v) failed: %v", childPath, err) + continue + } + stat, err := child.Stat(ctx, vfs.StatOptions{}) + if err != nil { + t.Errorf("Stat(%v) failed: %v", childPath, err) + } + if got := linux.FileMode(stat.Mode).DirentType(); got != d.Type { + t.Errorf("wrong file mode, stat: %v, dirent: %v", got, d.Type) + } + if d.Type == linux.DT_DIR { + // Found another dir, let's do it again! + iterateDir(ctx, t, vfsObj, root, child) + } + } +} + +// TestTree iterates all directories and stats every file. +func TestTree(t *testing.T) { + uberCtx, vfsObj, root, err := setup() + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + defer root.DecRef() + + k := kernel.KernelFromContext(uberCtx) + var tasks []*kernel.Task + for i := 0; i < 5; i++ { + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + task, err := createTask(uberCtx, fmt.Sprintf("name-%d", i), tc) + if err != nil { + t.Fatalf("CreateTask(): %v", err) + } + tasks = append(tasks, task) + } + + ctx := tasks[0] + fd, err := vfsObj.OpenAt( + ctx, + auth.CredentialsFromContext(uberCtx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt(/) failed: %v", err) + } + iterateDir(ctx, t, vfsObj, root, fd) +} diff --git a/pkg/sentry/fsimpl/proc/version.go b/pkg/sentry/fsimpl/proc/version.go index e1643d4e0..367f2396b 100644 --- a/pkg/sentry/fsimpl/proc/version.go +++ b/pkg/sentry/fsimpl/proc/version.go @@ -19,19 +19,21 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/vfs" ) // versionData implements vfs.DynamicBytesSource for /proc/version. // // +stateify savable type versionData struct { + kernfs.DynamicBytesFile + // k is the owning Kernel. k *kernel.Kernel } -var _ vfs.DynamicBytesSource = (*versionData)(nil) +var _ dynamicInode = (*versionData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error { diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index bc5c0b591..a5b285987 100644 --- a/pkg/sentry/fsimpl/memfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -1,14 +1,13 @@ load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) -load("//tools/go_generics:defs.bzl", "go_template_instance") - go_template_instance( name = "dentry_list", out = "dentry_list.go", - package = "memfs", + package = "tmpfs", prefix = "dentry", template = "//pkg/ilist:generic_list", types = { @@ -18,24 +17,34 @@ go_template_instance( ) go_library( - name = "memfs", + name = "tmpfs", srcs = [ "dentry_list.go", "directory.go", "filesystem.go", - "memfs.go", "named_pipe.go", "regular_file.go", "symlink.go", + "tmpfs.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs", + importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs", deps = [ "//pkg/abi/linux", "//pkg/amutex", + "//pkg/fspath", + "//pkg/log", "//pkg/sentry/arch", "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", + "//pkg/sentry/memmap", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + "//pkg/sentry/safemem", + "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/syserror", @@ -47,8 +56,9 @@ go_test( size = "small", srcs = ["benchmark_test.go"], deps = [ - ":memfs", + ":tmpfs", "//pkg/abi/linux", + "//pkg/fspath", "//pkg/refs", "//pkg/sentry/context", "//pkg/sentry/context/contexttest", @@ -61,15 +71,20 @@ go_test( ) go_test( - name = "memfs_test", + name = "tmpfs_test", size = "small", - srcs = ["pipe_test.go"], - embed = [":memfs"], + srcs = [ + "pipe_test.go", + "regular_file_test.go", + ], + embed = [":tmpfs"], deps = [ "//pkg/abi/linux", + "//pkg/fspath", "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/contexttest", "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/syserror", diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index ea6417ce7..d88c83499 100644 --- a/pkg/sentry/fsimpl/memfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -21,12 +21,13 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/context/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" @@ -175,8 +176,10 @@ func BenchmarkVFS2MemfsStat(b *testing.B) { // Create VFS. vfsObj := vfs.New() - vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{}) + vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } @@ -193,9 +196,9 @@ func BenchmarkVFS2MemfsStat(b *testing.B) { for i := depth; i > 0; i-- { name := fmt.Sprintf("%d", i) pop := vfs.PathOperation{ - Root: root, - Start: vd, - Pathname: name, + Root: root, + Start: vd, + Path: fspath.Parse(name), } if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{ Mode: 0755, @@ -216,7 +219,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) { fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ Root: root, Start: vd, - Pathname: filename, + Path: fspath.Parse(filename), FollowFinalSymlink: true, }, &vfs.OpenOptions{ Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, @@ -237,7 +240,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) { stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ Root: root, Start: root, - Pathname: filePath, + Path: fspath.Parse(filePath), FollowFinalSymlink: true, }, &vfs.StatOptions{}) if err != nil { @@ -364,8 +367,10 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { // Create VFS. vfsObj := vfs.New() - vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{}) + vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } @@ -378,9 +383,9 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { root := mntns.Root() defer root.DecRef() pop := vfs.PathOperation{ - Root: root, - Start: root, - Pathname: mountPointName, + Root: root, + Start: root, + Path: fspath.Parse(mountPointName), } if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{ Mode: 0755, @@ -394,7 +399,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { } defer mountPoint.DecRef() // Create and mount the submount. - if err := vfsObj.NewMount(ctx, creds, "", &pop, "memfs", &vfs.GetFilesystemOptions{}); err != nil { + if err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil { b.Fatalf("failed to mount tmpfs submount: %v", err) } filePathBuilder.WriteString(mountPointName) @@ -408,9 +413,9 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { for i := depth; i > 0; i-- { name := fmt.Sprintf("%d", i) pop := vfs.PathOperation{ - Root: root, - Start: vd, - Pathname: name, + Root: root, + Start: vd, + Path: fspath.Parse(name), } if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{ Mode: 0755, @@ -438,7 +443,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ Root: root, Start: vd, - Pathname: filename, + Path: fspath.Parse(filename), FollowFinalSymlink: true, }, &vfs.OpenOptions{ Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, @@ -458,7 +463,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ Root: root, Start: root, - Pathname: filePath, + Path: fspath.Parse(filePath), FollowFinalSymlink: true, }, &vfs.StatOptions{}) if err != nil { diff --git a/pkg/sentry/fsimpl/memfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index 0bd82e480..887ca2619 100644 --- a/pkg/sentry/fsimpl/memfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package memfs +package tmpfs import ( "gvisor.dev/gvisor/pkg/abi/linux" diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go new file mode 100644 index 000000000..26979729e --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -0,0 +1,698 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Sync implements vfs.FilesystemImpl.Sync. +func (fs *filesystem) Sync(ctx context.Context) error { + // All filesystem state is in-memory. + return nil +} + +// stepLocked resolves rp.Component() to an existing file, starting from the +// given directory. +// +// stepLocked is loosely analogous to fs/namei.c:walk_component(). +// +// Preconditions: filesystem.mu must be locked. !rp.Done(). +func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { + if !d.inode.isDir() { + return nil, syserror.ENOTDIR + } + if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + return nil, err + } +afterSymlink: + nextVFSD, err := rp.ResolveComponent(&d.vfsd) + if err != nil { + return nil, err + } + if nextVFSD == nil { + // Since the Dentry tree is the sole source of truth for tmpfs, if it's + // not in the Dentry tree, it doesn't exist. + return nil, syserror.ENOENT + } + next := nextVFSD.Impl().(*dentry) + if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { + // TODO: symlink traversals update access time + if err := rp.HandleSymlink(symlink.target); err != nil { + return nil, err + } + goto afterSymlink // don't check the current directory again + } + rp.Advance() + return next, nil +} + +// walkParentDirLocked resolves all but the last path component of rp to an +// existing directory, starting from the given directory (which is usually +// rp.Start().Impl().(*dentry)). It does not check that the returned directory +// is searchable by the provider of rp. +// +// walkParentDirLocked is loosely analogous to Linux's +// fs/namei.c:path_parentat(). +// +// Preconditions: filesystem.mu must be locked. !rp.Done(). +func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { + for !rp.Final() { + next, err := stepLocked(rp, d) + if err != nil { + return nil, err + } + d = next + } + if !d.inode.isDir() { + return nil, syserror.ENOTDIR + } + return d, nil +} + +// resolveLocked resolves rp to an existing file. +// +// resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat(). +// +// Preconditions: filesystem.mu must be locked. +func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) { + d := rp.Start().Impl().(*dentry) + for !rp.Done() { + next, err := stepLocked(rp, d) + if err != nil { + return nil, err + } + d = next + } + if rp.MustBeDir() && !d.inode.isDir() { + return nil, syserror.ENOTDIR + } + return d, nil +} + +// doCreateAt checks that creating a file at rp is permitted, then invokes +// create to do so. +// +// doCreateAt is loosely analogous to a conjunction of Linux's +// fs/namei.c:filename_create() and done_path_create(). +// +// Preconditions: !rp.Done(). For the final path component in rp, +// !rp.ShouldFollowSymlink(). +func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error { + fs.mu.Lock() + defer fs.mu.Unlock() + parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + if err != nil { + return err + } + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + return err + } + name := rp.Component() + if name == "." || name == ".." { + return syserror.EEXIST + } + // Call parent.vfsd.Child() instead of stepLocked() or rp.ResolveChild(), + // because if the child exists we want to return EEXIST immediately instead + // of attempting symlink/mount traversal. + if parent.vfsd.Child(name) != nil { + return syserror.EEXIST + } + if !dir && rp.MustBeDir() { + return syserror.ENOENT + } + // In memfs, the only way to cause a dentry to be disowned is by removing + // it from the filesystem, so this check is equivalent to checking if + // parent has been removed. + if parent.vfsd.IsDisowned() { + return syserror.ENOENT + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + return create(parent, name) +} + +// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. +func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := resolveLocked(rp) + if err != nil { + return nil, err + } + if opts.CheckSearchable { + if !d.inode.isDir() { + return nil, syserror.ENOTDIR + } + if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true /* isDir */); err != nil { + return nil, err + } + } + d.IncRef() + return &d.vfsd, nil +} + +// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. +func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + if err != nil { + return nil, err + } + d.IncRef() + return &d.vfsd, nil +} + +// LinkAt implements vfs.FilesystemImpl.LinkAt. +func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { + return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error { + if rp.Mount() != vd.Mount() { + return syserror.EXDEV + } + d := vd.Dentry().Impl().(*dentry) + if d.inode.isDir() { + return syserror.EPERM + } + if d.inode.nlink == 0 { + return syserror.ENOENT + } + if d.inode.nlink == maxLinks { + return syserror.EMLINK + } + d.inode.incLinksLocked() + child := fs.newDentry(d.inode) + parent.vfsd.InsertChild(&child.vfsd, name) + parent.inode.impl.(*directory).childList.PushBack(child) + return nil + }) +} + +// MkdirAt implements vfs.FilesystemImpl.MkdirAt. +func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + return fs.doCreateAt(rp, true /* dir */, func(parent *dentry, name string) error { + if parent.inode.nlink == maxLinks { + return syserror.EMLINK + } + parent.inode.incLinksLocked() // from child's ".." + child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode)) + parent.vfsd.InsertChild(&child.vfsd, name) + parent.inode.impl.(*directory).childList.PushBack(child) + return nil + }) +} + +// MknodAt implements vfs.FilesystemImpl.MknodAt. +func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { + return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error { + switch opts.Mode.FileType() { + case 0, linux.S_IFREG: + child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode)) + parent.vfsd.InsertChild(&child.vfsd, name) + parent.inode.impl.(*directory).childList.PushBack(child) + return nil + case linux.S_IFIFO: + child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode)) + parent.vfsd.InsertChild(&child.vfsd, name) + parent.inode.impl.(*directory).childList.PushBack(child) + return nil + case linux.S_IFBLK, linux.S_IFCHR, linux.S_IFSOCK: + // Not yet supported. + return syserror.EPERM + default: + return syserror.EINVAL + } + }) +} + +// OpenAt implements vfs.FilesystemImpl.OpenAt. +func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + if opts.Flags&linux.O_TMPFILE != 0 { + // Not yet supported. + return nil, syserror.EOPNOTSUPP + } + + // Handle O_CREAT and !O_CREAT separately, since in the latter case we + // don't need fs.mu for writing. + if opts.Flags&linux.O_CREAT == 0 { + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := resolveLocked(rp) + if err != nil { + return nil, err + } + return d.open(ctx, rp, opts.Flags, false /* afterCreate */) + } + + mustCreate := opts.Flags&linux.O_EXCL != 0 + start := rp.Start().Impl().(*dentry) + fs.mu.Lock() + defer fs.mu.Unlock() + if rp.Done() { + // Reject attempts to open directories with O_CREAT. + if rp.MustBeDir() { + return nil, syserror.EISDIR + } + if mustCreate { + return nil, syserror.EEXIST + } + return start.open(ctx, rp, opts.Flags, false /* afterCreate */) + } +afterTrailingSymlink: + parent, err := walkParentDirLocked(rp, start) + if err != nil { + return nil, err + } + // Check for search permission in the parent directory. + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + return nil, err + } + // Reject attempts to open directories with O_CREAT. + if rp.MustBeDir() { + return nil, syserror.EISDIR + } + name := rp.Component() + if name == "." || name == ".." { + return nil, syserror.EISDIR + } + // Determine whether or not we need to create a file. + child, err := stepLocked(rp, parent) + if err == syserror.ENOENT { + // Already checked for searchability above; now check for writability. + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { + return nil, err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return nil, err + } + defer rp.Mount().EndWrite() + // Create and open the child. + child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode)) + parent.vfsd.InsertChild(&child.vfsd, name) + parent.inode.impl.(*directory).childList.PushBack(child) + return child.open(ctx, rp, opts.Flags, true) + } + if err != nil { + return nil, err + } + // Do we need to resolve a trailing symlink? + if !rp.Done() { + start = parent + goto afterTrailingSymlink + } + // Open existing file. + if mustCreate { + return nil, syserror.EEXIST + } + return child.open(ctx, rp, opts.Flags, false) +} + +func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, afterCreate bool) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(flags) + if !afterCreate { + if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil { + return nil, err + } + } + mnt := rp.Mount() + switch impl := d.inode.impl.(type) { + case *regularFile: + var fd regularFileFD + fd.readable = vfs.MayReadFileWithOpenFlags(flags) + fd.writable = vfs.MayWriteFileWithOpenFlags(flags) + if fd.writable { + if err := mnt.CheckBeginWrite(); err != nil { + return nil, err + } + // mnt.EndWrite() is called by regularFileFD.Release(). + } + fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}) + if flags&linux.O_TRUNC != 0 { + impl.mu.Lock() + impl.data.Truncate(0, impl.memFile) + atomic.StoreUint64(&impl.size, 0) + impl.mu.Unlock() + } + return &fd.vfsfd, nil + case *directory: + // Can't open directories writably. + if ats&vfs.MayWrite != 0 { + return nil, syserror.EISDIR + } + var fd directoryFD + fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}) + return &fd.vfsfd, nil + case *symlink: + // Can't open symlinks without O_PATH (which is unimplemented). + return nil, syserror.ELOOP + case *namedPipe: + return newNamedPipeFD(ctx, impl, rp, &d.vfsd, flags) + default: + panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl)) + } +} + +// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. +func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := resolveLocked(rp) + if err != nil { + return "", err + } + symlink, ok := d.inode.impl.(*symlink) + if !ok { + return "", syserror.EINVAL + } + return symlink.target, nil +} + +// RenameAt implements vfs.FilesystemImpl.RenameAt. +func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { + if opts.Flags != 0 { + // TODO(b/145974740): Support renameat2 flags. + return syserror.EINVAL + } + + // Resolve newParent first to verify that it's on this Mount. + fs.mu.Lock() + defer fs.mu.Unlock() + newParent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + if err != nil { + return err + } + newName := rp.Component() + if newName == "." || newName == ".." { + return syserror.EBUSY + } + mnt := rp.Mount() + if mnt != oldParentVD.Mount() { + return syserror.EXDEV + } + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + + oldParent := oldParentVD.Dentry().Impl().(*dentry) + if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + return err + } + // Call vfs.Dentry.Child() instead of stepLocked() or rp.ResolveChild(), + // because if the existing child is a symlink or mount point then we want + // to rename over it rather than follow it. + renamedVFSD := oldParent.vfsd.Child(oldName) + if renamedVFSD == nil { + return syserror.ENOENT + } + renamed := renamedVFSD.Impl().(*dentry) + if renamed.inode.isDir() { + if renamed == newParent || renamedVFSD.IsAncestorOf(&newParent.vfsd) { + return syserror.EINVAL + } + if oldParent != newParent { + // Writability is needed to change renamed's "..". + if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true /* isDir */); err != nil { + return err + } + } + } else { + if opts.MustBeDir || rp.MustBeDir() { + return syserror.ENOTDIR + } + } + + if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + return err + } + replacedVFSD := newParent.vfsd.Child(newName) + var replaced *dentry + if replacedVFSD != nil { + replaced = replacedVFSD.Impl().(*dentry) + if replaced.inode.isDir() { + if !renamed.inode.isDir() { + return syserror.EISDIR + } + if replaced.vfsd.HasChildren() { + return syserror.ENOTEMPTY + } + } else { + if rp.MustBeDir() { + return syserror.ENOTDIR + } + if renamed.inode.isDir() { + return syserror.ENOTDIR + } + } + } else { + if renamed.inode.isDir() && newParent.inode.nlink == maxLinks { + return syserror.EMLINK + } + } + if newParent.vfsd.IsDisowned() { + return syserror.ENOENT + } + + // Linux places this check before some of those above; we do it here for + // simplicity, under the assumption that applications are not intentionally + // doing noop renames expecting them to succeed where non-noop renames + // would fail. + if renamedVFSD == replacedVFSD { + return nil + } + vfsObj := rp.VirtualFilesystem() + oldParentDir := oldParent.inode.impl.(*directory) + newParentDir := newParent.inode.impl.(*directory) + if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), renamedVFSD, replacedVFSD); err != nil { + return err + } + if replaced != nil { + newParentDir.childList.Remove(replaced) + if replaced.inode.isDir() { + newParent.inode.decLinksLocked() // from replaced's ".." + } + replaced.inode.decLinksLocked() + } + oldParentDir.childList.Remove(renamed) + newParentDir.childList.PushBack(renamed) + if renamed.inode.isDir() { + oldParent.inode.decLinksLocked() + newParent.inode.incLinksLocked() + } + // TODO: update timestamps and parent directory sizes + vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD) + return nil +} + +// RmdirAt implements vfs.FilesystemImpl.RmdirAt. +func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { + fs.mu.Lock() + defer fs.mu.Unlock() + parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + if err != nil { + return err + } + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + return err + } + name := rp.Component() + if name == "." { + return syserror.EINVAL + } + if name == ".." { + return syserror.ENOTEMPTY + } + childVFSD := parent.vfsd.Child(name) + if childVFSD == nil { + return syserror.ENOENT + } + child := childVFSD.Impl().(*dentry) + if !child.inode.isDir() { + return syserror.ENOTDIR + } + if childVFSD.HasChildren() { + return syserror.ENOTEMPTY + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + vfsObj := rp.VirtualFilesystem() + if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil { + return err + } + parent.inode.impl.(*directory).childList.Remove(child) + parent.inode.decLinksLocked() // from child's ".." + child.inode.decLinksLocked() + vfsObj.CommitDeleteDentry(childVFSD) + return nil +} + +// SetStatAt implements vfs.FilesystemImpl.SetStatAt. +func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + _, err := resolveLocked(rp) + if err != nil { + return err + } + if opts.Stat.Mask == 0 { + return nil + } + // TODO: implement inode.setStat + return syserror.EPERM +} + +// StatAt implements vfs.FilesystemImpl.StatAt. +func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := resolveLocked(rp) + if err != nil { + return linux.Statx{}, err + } + var stat linux.Statx + d.inode.statTo(&stat) + return stat, nil +} + +// StatFSAt implements vfs.FilesystemImpl.StatFSAt. +func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + _, err := resolveLocked(rp) + if err != nil { + return linux.Statfs{}, err + } + // TODO: actually implement statfs + return linux.Statfs{}, syserror.ENOSYS +} + +// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. +func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { + return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error { + child := fs.newDentry(fs.newSymlink(rp.Credentials(), target)) + parent.vfsd.InsertChild(&child.vfsd, name) + parent.inode.impl.(*directory).childList.PushBack(child) + return nil + }) +} + +// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. +func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { + fs.mu.Lock() + defer fs.mu.Unlock() + parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + if err != nil { + return err + } + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + return err + } + name := rp.Component() + if name == "." || name == ".." { + return syserror.EISDIR + } + childVFSD := parent.vfsd.Child(name) + if childVFSD == nil { + return syserror.ENOENT + } + child := childVFSD.Impl().(*dentry) + if child.inode.isDir() { + return syserror.EISDIR + } + if !rp.MustBeDir() { + return syserror.ENOTDIR + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + vfsObj := rp.VirtualFilesystem() + if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil { + return err + } + parent.inode.impl.(*directory).childList.Remove(child) + child.inode.decLinksLocked() + vfsObj.CommitDeleteDentry(childVFSD) + return nil +} + +// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + _, err := resolveLocked(rp) + if err != nil { + return nil, err + } + // TODO(b/127675828): support extended attributes + return nil, syserror.ENOTSUP +} + +// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + _, err := resolveLocked(rp) + if err != nil { + return "", err + } + // TODO(b/127675828): support extended attributes + return "", syserror.ENOTSUP +} + +// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. +func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + _, err := resolveLocked(rp) + if err != nil { + return err + } + // TODO(b/127675828): support extended attributes + return syserror.ENOTSUP +} + +// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. +func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + _, err := resolveLocked(rp) + if err != nil { + return err + } + // TODO(b/127675828): support extended attributes + return syserror.ENOTSUP +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + return vfs.GenericPrependPath(vfsroot, vd, b) +} diff --git a/pkg/sentry/fsimpl/memfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go index 91cb4b1fc..40bde54de 100644 --- a/pkg/sentry/fsimpl/memfs/named_pipe.go +++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package memfs +package tmpfs import ( "gvisor.dev/gvisor/pkg/abi/linux" @@ -55,8 +55,6 @@ func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, v return nil, err } mnt := rp.Mount() - mnt.IncRef() - vfsd.IncRef() - fd.vfsfd.Init(&fd, mnt, vfsd) + fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil } diff --git a/pkg/sentry/fsimpl/memfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go index a3a870571..70b42a6ec 100644 --- a/pkg/sentry/fsimpl/memfs/pipe_test.go +++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go @@ -12,13 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -package memfs +package tmpfs import ( "bytes" "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/context/contexttest" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -38,7 +39,7 @@ func TestSeparateFDs(t *testing.T) { pop := vfs.PathOperation{ Root: root, Start: root, - Pathname: fileName, + Path: fspath.Parse(fileName), FollowFinalSymlink: true, } rfdchan := make(chan *vfs.FileDescription) @@ -76,7 +77,7 @@ func TestNonblockingRead(t *testing.T) { pop := vfs.PathOperation{ Root: root, Start: root, - Pathname: fileName, + Path: fspath.Parse(fileName), FollowFinalSymlink: true, } openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_NONBLOCK} @@ -108,7 +109,7 @@ func TestNonblockingWriteError(t *testing.T) { pop := vfs.PathOperation{ Root: root, Start: root, - Pathname: fileName, + Path: fspath.Parse(fileName), FollowFinalSymlink: true, } openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY | linux.O_NONBLOCK} @@ -126,7 +127,7 @@ func TestSingleFD(t *testing.T) { pop := vfs.PathOperation{ Root: root, Start: root, - Pathname: fileName, + Path: fspath.Parse(fileName), FollowFinalSymlink: true, } openOpts := vfs.OpenOptions{Flags: linux.O_RDWR} @@ -151,8 +152,10 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy // Create VFS. vfsObj := vfs.New() - vfsObj.MustRegisterFilesystemType("memfs", FilesystemType{}) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{}) + vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) if err != nil { t.Fatalf("failed to create tmpfs root mount: %v", err) } @@ -160,10 +163,9 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy // Create the pipe. root := mntns.Root() pop := vfs.PathOperation{ - Root: root, - Start: root, - Pathname: fileName, - FollowFinalSymlink: true, + Root: root, + Start: root, + Path: fspath.Parse(fileName), } mknodOpts := vfs.MknodOptions{Mode: linux.ModeNamedPipe | 0644} if err := vfsObj.MknodAt(ctx, creds, &pop, &mknodOpts); err != nil { @@ -174,7 +176,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ Root: root, Start: root, - Pathname: fileName, + Path: fspath.Parse(fileName), FollowFinalSymlink: true, }, &vfs.StatOptions{}) if err != nil { @@ -194,7 +196,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) { readData := make([]byte, 1) dst := usermem.BytesIOSequence(readData) - bytesRead, err := fd.Impl().Read(ctx, dst, vfs.ReadOptions{}) + bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{}) if err != syserror.ErrWouldBlock { t.Fatalf("expected ErrWouldBlock reading from empty pipe %q, but got: %v", fileName, err) } @@ -207,7 +209,7 @@ func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) { func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) { writeData := []byte(msg) src := usermem.BytesIOSequence(writeData) - bytesWritten, err := fd.Impl().Write(ctx, src, vfs.WriteOptions{}) + bytesWritten, err := fd.Write(ctx, src, vfs.WriteOptions{}) if err != nil { t.Fatalf("error writing to pipe %q: %v", fileName, err) } @@ -220,7 +222,7 @@ func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg func checkRead(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) { readData := make([]byte, len(msg)) dst := usermem.BytesIOSequence(readData) - bytesRead, err := fd.Impl().Read(ctx, dst, vfs.ReadOptions{}) + bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{}) if err != nil { t.Fatalf("error reading from pipe %q: %v", fileName, err) } diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go new file mode 100644 index 000000000..f51e247a7 --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -0,0 +1,357 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "io" + "math" + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/safemem" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +type regularFile struct { + inode inode + + // memFile is a platform.File used to allocate pages to this regularFile. + memFile *pgalloc.MemoryFile + + // mu protects the fields below. + mu sync.RWMutex + + // data maps offsets into the file to offsets into memFile that store + // the file's data. + data fsutil.FileRangeSet + + // size is the size of data, but accessed using atomic memory + // operations to avoid locking in inode.stat(). + size uint64 + + // seals represents file seals on this inode. + seals uint32 +} + +func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode { + file := ®ularFile{ + memFile: fs.memFile, + } + file.inode.init(file, fs, creds, mode) + file.inode.nlink = 1 // from parent directory + return &file.inode +} + +type regularFileFD struct { + fileDescription + + // These are immutable. + readable bool + writable bool + + // off is the file offset. off is accessed using atomic memory operations. + // offMu serializes operations that may mutate off. + off int64 + offMu sync.Mutex +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *regularFileFD) Release() { + if fd.writable { + fd.vfsfd.VirtualDentry().Mount().EndWrite() + } +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + if !fd.readable { + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + if dst.NumBytes() == 0 { + return 0, nil + } + f := fd.inode().impl.(*regularFile) + rw := getRegularFileReadWriter(f, offset) + n, err := dst.CopyOutFrom(ctx, rw) + putRegularFileReadWriter(rw) + return int64(n), err +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + fd.offMu.Lock() + n, err := fd.PRead(ctx, dst, fd.off, opts) + fd.off += n + fd.offMu.Unlock() + return n, err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + if !fd.writable { + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + srclen := src.NumBytes() + if srclen == 0 { + return 0, nil + } + f := fd.inode().impl.(*regularFile) + end := offset + srclen + if end < offset { + // Overflow. + return 0, syserror.EFBIG + } + rw := getRegularFileReadWriter(f, offset) + n, err := src.CopyInTo(ctx, rw) + putRegularFileReadWriter(rw) + return n, err +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + fd.offMu.Lock() + n, err := fd.PWrite(ctx, src, fd.off, opts) + fd.off += n + fd.offMu.Unlock() + return n, err +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.offMu.Lock() + defer fd.offMu.Unlock() + switch whence { + case linux.SEEK_SET: + // use offset as specified + case linux.SEEK_CUR: + offset += fd.off + case linux.SEEK_END: + offset += int64(atomic.LoadUint64(&fd.inode().impl.(*regularFile).size)) + default: + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + fd.off = offset + return offset, nil +} + +// Sync implements vfs.FileDescriptionImpl.Sync. +func (fd *regularFileFD) Sync(ctx context.Context) error { + return nil +} + +// regularFileReadWriter implements safemem.Reader and Safemem.Writer. +type regularFileReadWriter struct { + file *regularFile + + // Offset into the file to read/write at. Note that this may be + // different from the FD offset if PRead/PWrite is used. + off uint64 +} + +var regularFileReadWriterPool = sync.Pool{ + New: func() interface{} { + return ®ularFileReadWriter{} + }, +} + +func getRegularFileReadWriter(file *regularFile, offset int64) *regularFileReadWriter { + rw := regularFileReadWriterPool.Get().(*regularFileReadWriter) + rw.file = file + rw.off = uint64(offset) + return rw +} + +func putRegularFileReadWriter(rw *regularFileReadWriter) { + rw.file = nil + regularFileReadWriterPool.Put(rw) +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + rw.file.mu.RLock() + + // Compute the range to read (limited by file size and overflow-checked). + if rw.off >= rw.file.size { + rw.file.mu.RUnlock() + return 0, io.EOF + } + end := rw.file.size + if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end { + end = rend + } + + var done uint64 + seg, gap := rw.file.data.Find(uint64(rw.off)) + for rw.off < end { + mr := memmap.MappableRange{uint64(rw.off), uint64(end)} + switch { + case seg.Ok(): + // Get internal mappings. + ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read) + if err != nil { + rw.file.mu.RUnlock() + return done, err + } + + // Copy from internal mappings. + n, err := safemem.CopySeq(dsts, ims) + done += n + rw.off += uint64(n) + dsts = dsts.DropFirst64(n) + if err != nil { + rw.file.mu.RUnlock() + return done, err + } + + // Continue. + seg, gap = seg.NextNonEmpty() + + case gap.Ok(): + // Tmpfs holes are zero-filled. + gapmr := gap.Range().Intersect(mr) + dst := dsts.TakeFirst64(gapmr.Length()) + n, err := safemem.ZeroSeq(dst) + done += n + rw.off += uint64(n) + dsts = dsts.DropFirst64(n) + if err != nil { + rw.file.mu.RUnlock() + return done, err + } + + // Continue. + seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{} + } + } + rw.file.mu.RUnlock() + return done, nil +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + rw.file.mu.Lock() + + // Compute the range to write (overflow-checked). + end := rw.off + srcs.NumBytes() + if end <= rw.off { + end = math.MaxInt64 + } + + // Check if seals prevent either file growth or all writes. + switch { + case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed + rw.file.mu.Unlock() + return 0, syserror.EPERM + case end > rw.file.size && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed + // When growth is sealed, Linux effectively allows writes which would + // normally grow the file to partially succeed up to the current EOF, + // rounded down to the page boundary before the EOF. + // + // This happens because writes (and thus the growth check) for tmpfs + // files proceed page-by-page on Linux, and the final write to the page + // containing EOF fails, resulting in a partial write up to the start of + // that page. + // + // To emulate this behaviour, artifically truncate the write to the + // start of the page containing the current EOF. + // + // See Linux, mm/filemap.c:generic_perform_write() and + // mm/shmem.c:shmem_write_begin(). + if pgstart := uint64(usermem.Addr(rw.file.size).RoundDown()); end > pgstart { + end = pgstart + } + if end <= rw.off { + // Truncation would result in no data being written. + rw.file.mu.Unlock() + return 0, syserror.EPERM + } + } + + // Page-aligned mr for when we need to allocate memory. RoundUp can't + // overflow since end is an int64. + pgstartaddr := usermem.Addr(rw.off).RoundDown() + pgendaddr, _ := usermem.Addr(end).RoundUp() + pgMR := memmap.MappableRange{uint64(pgstartaddr), uint64(pgendaddr)} + + var ( + done uint64 + retErr error + ) + seg, gap := rw.file.data.Find(uint64(rw.off)) + for rw.off < end { + mr := memmap.MappableRange{uint64(rw.off), uint64(end)} + switch { + case seg.Ok(): + // Get internal mappings. + ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Write) + if err != nil { + retErr = err + goto exitLoop + } + + // Copy to internal mappings. + n, err := safemem.CopySeq(ims, srcs) + done += n + rw.off += uint64(n) + srcs = srcs.DropFirst64(n) + if err != nil { + retErr = err + goto exitLoop + } + + // Continue. + seg, gap = seg.NextNonEmpty() + + case gap.Ok(): + // Allocate memory for the write. + gapMR := gap.Range().Intersect(pgMR) + fr, err := rw.file.memFile.Allocate(gapMR.Length(), usage.Tmpfs) + if err != nil { + retErr = err + goto exitLoop + } + + // Write to that memory as usual. + seg, gap = rw.file.data.Insert(gap, gapMR, fr.Start), fsutil.FileRangeGapIterator{} + } + } +exitLoop: + // If the write ends beyond the file's previous size, it causes the + // file to grow. + if rw.off > rw.file.size { + atomic.StoreUint64(&rw.file.size, rw.off) + } + + rw.file.mu.Unlock() + return done, retErr +} diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go new file mode 100644 index 000000000..3731c5b6f --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -0,0 +1,224 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "bytes" + "fmt" + "io" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// newFileFD creates a new file in a new tmpfs mount, and returns the FD. If +// the returned err is not nil, then cleanup should be called when the FD is no +// longer needed. +func newFileFD(ctx context.Context, filename string) (*vfs.FileDescription, func(), error) { + creds := auth.CredentialsFromContext(ctx) + + vfsObj := vfs.New() + vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + if err != nil { + return nil, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err) + } + root := mntns.Root() + + // Create the file that will be write/read. + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(filename), + FollowFinalSymlink: true, + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, + Mode: 0644, + }) + if err != nil { + root.DecRef() + mntns.DecRef(vfsObj) + return nil, nil, fmt.Errorf("failed to create file %q: %v", filename, err) + } + + return fd, func() { + root.DecRef() + mntns.DecRef(vfsObj) + }, nil +} + +// Test that we can write some data to a file and read it back.` +func TestSimpleWriteRead(t *testing.T) { + ctx := contexttest.Context(t) + fd, cleanup, err := newFileFD(ctx, "simpleReadWrite") + if err != nil { + t.Fatal(err) + } + defer cleanup() + + // Write. + data := []byte("foobarbaz") + n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{}) + if err != nil { + t.Fatalf("fd.Write failed: %v", err) + } + if n != int64(len(data)) { + t.Errorf("fd.Write got short write length %d, want %d", n, len(data)) + } + if got, want := fd.Impl().(*regularFileFD).off, int64(len(data)); got != want { + t.Errorf("fd.Write left offset at %d, want %d", got, want) + } + + // Seek back to beginning. + if _, err := fd.Seek(ctx, 0, linux.SEEK_SET); err != nil { + t.Fatalf("fd.Seek failed: %v", err) + } + if got, want := fd.Impl().(*regularFileFD).off, int64(0); got != want { + t.Errorf("fd.Seek(0) left offset at %d, want %d", got, want) + } + + // Read. + buf := make([]byte, len(data)) + n, err = fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{}) + if err != nil && err != io.EOF { + t.Fatalf("fd.Read failed: %v", err) + } + if n != int64(len(data)) { + t.Errorf("fd.Read got short read length %d, want %d", n, len(data)) + } + if got, want := string(buf), string(data); got != want { + t.Errorf("Read got %q want %s", got, want) + } + if got, want := fd.Impl().(*regularFileFD).off, int64(len(data)); got != want { + t.Errorf("fd.Write left offset at %d, want %d", got, want) + } +} + +func TestPWrite(t *testing.T) { + ctx := contexttest.Context(t) + fd, cleanup, err := newFileFD(ctx, "PRead") + if err != nil { + t.Fatal(err) + } + defer cleanup() + + // Fill file with 1k 'a's. + data := bytes.Repeat([]byte{'a'}, 1000) + n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{}) + if err != nil { + t.Fatalf("fd.Write failed: %v", err) + } + if n != int64(len(data)) { + t.Errorf("fd.Write got short write length %d, want %d", n, len(data)) + } + + // Write "gVisor is awesome" at various offsets. + buf := []byte("gVisor is awesome") + offsets := []int{0, 1, 2, 10, 20, 50, 100, len(data) - 100, len(data) - 1, len(data), len(data) + 1} + for _, offset := range offsets { + name := fmt.Sprintf("PWrite offset=%d", offset) + t.Run(name, func(t *testing.T) { + n, err := fd.PWrite(ctx, usermem.BytesIOSequence(buf), int64(offset), vfs.WriteOptions{}) + if err != nil { + t.Errorf("fd.PWrite got err %v want nil", err) + } + if n != int64(len(buf)) { + t.Errorf("fd.PWrite got %d bytes want %d", n, len(buf)) + } + + // Update data to reflect expected file contents. + if len(data) < offset+len(buf) { + data = append(data, make([]byte, (offset+len(buf))-len(data))...) + } + copy(data[offset:], buf) + + // Read the whole file and compare with data. + readBuf := make([]byte, len(data)) + n, err = fd.PRead(ctx, usermem.BytesIOSequence(readBuf), 0, vfs.ReadOptions{}) + if err != nil { + t.Fatalf("fd.PRead failed: %v", err) + } + if n != int64(len(data)) { + t.Errorf("fd.PRead got short read length %d, want %d", n, len(data)) + } + if got, want := string(readBuf), string(data); got != want { + t.Errorf("PRead got %q want %s", got, want) + } + + }) + } +} + +func TestPRead(t *testing.T) { + ctx := contexttest.Context(t) + fd, cleanup, err := newFileFD(ctx, "PRead") + if err != nil { + t.Fatal(err) + } + defer cleanup() + + // Write 100 sequences of 'gVisor is awesome'. + data := bytes.Repeat([]byte("gVisor is awsome"), 100) + n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{}) + if err != nil { + t.Fatalf("fd.Write failed: %v", err) + } + if n != int64(len(data)) { + t.Errorf("fd.Write got short write length %d, want %d", n, len(data)) + } + + // Read various sizes from various offsets. + sizes := []int{0, 1, 2, 10, 20, 50, 100, 1000} + offsets := []int{0, 1, 2, 10, 20, 50, 100, 1000, len(data) - 100, len(data) - 1, len(data), len(data) + 1} + + for _, size := range sizes { + for _, offset := range offsets { + name := fmt.Sprintf("PRead offset=%d size=%d", offset, size) + t.Run(name, func(t *testing.T) { + var ( + wantRead []byte + wantErr error + ) + if offset < len(data) { + wantRead = data[offset:] + } else if size > 0 { + wantErr = io.EOF + } + if offset+size < len(data) { + wantRead = wantRead[:size] + } + buf := make([]byte, size) + n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), int64(offset), vfs.ReadOptions{}) + if err != wantErr { + t.Errorf("fd.PRead got err %v want %v", err, wantErr) + } + if n != int64(len(wantRead)) { + t.Errorf("fd.PRead got %d bytes want %d", n, len(wantRead)) + } + if got := string(buf[:n]); got != string(wantRead) { + t.Errorf("fd.PRead got %q want %q", got, string(wantRead)) + } + }) + } + } +} diff --git a/pkg/sentry/fsimpl/memfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go index b2ac2cbeb..5246aca84 100644 --- a/pkg/sentry/fsimpl/memfs/symlink.go +++ b/pkg/sentry/fsimpl/tmpfs/symlink.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package memfs +package tmpfs import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 4cb2a4e0f..7be6faa5b 100644 --- a/pkg/sentry/fsimpl/memfs/memfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -12,29 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package memfs provides a filesystem implementation that behaves like tmpfs: +// Package tmpfs provides a filesystem implementation that behaves like tmpfs: // the Dentry tree is the sole source of truth for the state of the filesystem. // -// memfs is intended primarily to demonstrate filesystem implementation -// patterns. Real uses cases for an in-memory filesystem should use tmpfs -// instead. -// // Lock order: // // filesystem.mu // regularFileFD.offMu // regularFile.mu // inode.mu -package memfs +package tmpfs import ( "fmt" + "math" "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -46,6 +44,9 @@ type FilesystemType struct{} type filesystem struct { vfsfs vfs.Filesystem + // memFile is used to allocate pages to for regular files. + memFile *pgalloc.MemoryFile + // mu serializes changes to the Dentry tree. mu sync.RWMutex @@ -54,7 +55,13 @@ type filesystem struct { // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { - var fs filesystem + memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx) + if memFileProvider == nil { + panic("MemoryFileProviderFromContext returned nil") + } + fs := filesystem{ + memFile: memFileProvider.MemoryFile(), + } fs.vfsfs.Init(vfsObj, &fs) root := fs.newDentry(fs.newDirectory(creds, 01777)) return &fs.vfsfs, &root.vfsd, nil @@ -64,12 +71,6 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt func (fs *filesystem) Release() { } -// Sync implements vfs.FilesystemImpl.Sync. -func (fs *filesystem) Sync(ctx context.Context) error { - // All filesystem state is in-memory. - return nil -} - // dentry implements vfs.DentryImpl. type dentry struct { vfsd vfs.Dentry @@ -79,11 +80,11 @@ type dentry struct { // immutable. inode *inode - // memfs doesn't count references on dentries; because the dentry tree is + // tmpfs doesn't count references on dentries; because the dentry tree is // the sole source of truth, it is by definition always consistent with the // state of the filesystem. However, it does count references on inodes, // because inode resources are released when all references are dropped. - // (memfs doesn't really have resources to release, but we implement + // (tmpfs doesn't really have resources to release, but we implement // reference counting because tmpfs regular files will.) // dentryEntry (ugh) links dentries into their parent directory.childList. @@ -137,6 +138,8 @@ type inode struct { impl interface{} // immutable } +const maxLinks = math.MaxUint32 + func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) { i.refs = 1 i.mode = uint32(mode) @@ -147,25 +150,33 @@ func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, i.impl = impl } -// Preconditions: filesystem.mu must be locked for writing. +// incLinksLocked increments i's link count. +// +// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. +// i.nlink < maxLinks. func (i *inode) incLinksLocked() { - if atomic.AddUint32(&i.nlink, 1) <= 1 { - panic("memfs.inode.incLinksLocked() called with no existing links") + if i.nlink == 0 { + panic("tmpfs.inode.incLinksLocked() called with no existing links") } + if i.nlink == maxLinks { + panic("memfs.inode.incLinksLocked() called with maximum link count") + } + atomic.AddUint32(&i.nlink, 1) } -// Preconditions: filesystem.mu must be locked for writing. +// decLinksLocked decrements i's link count. +// +// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. func (i *inode) decLinksLocked() { - if nlink := atomic.AddUint32(&i.nlink, ^uint32(0)); nlink == 0 { - i.decRef() - } else if nlink == ^uint32(0) { // negative overflow - panic("memfs.inode.decLinksLocked() called with no existing links") + if i.nlink == 0 { + panic("tmpfs.inode.decLinksLocked() called with no existing links") } + atomic.AddUint32(&i.nlink, ^uint32(0)) } func (i *inode) incRef() { if atomic.AddInt64(&i.refs, 1) <= 1 { - panic("memfs.inode.incRef() called without holding a reference") + panic("tmpfs.inode.incRef() called without holding a reference") } } @@ -184,14 +195,14 @@ func (i *inode) tryIncRef() bool { func (i *inode) decRef() { if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { // This is unnecessary; it's mostly to simulate what tmpfs would do. - if regfile, ok := i.impl.(*regularFile); ok { - regfile.mu.Lock() - regfile.data = nil - atomic.StoreInt64(®file.dataLen, 0) - regfile.mu.Unlock() + if regFile, ok := i.impl.(*regularFile); ok { + regFile.mu.Lock() + regFile.data.DropAll(regFile.memFile) + atomic.StoreUint64(®File.size, 0) + regFile.mu.Unlock() } } else if refs < 0 { - panic("memfs.inode.decRef() called without holding a reference") + panic("tmpfs.inode.decRef() called without holding a reference") } } @@ -215,7 +226,7 @@ func (i *inode) statTo(stat *linux.Statx) { case *regularFile: stat.Mode |= linux.S_IFREG stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS - stat.Size = uint64(atomic.LoadInt64(&impl.dataLen)) + stat.Size = uint64(atomic.LoadUint64(&impl.size)) // In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in // a uint64 accessed using atomic memory operations to avoid taking // locks). @@ -256,13 +267,11 @@ func (i *inode) direntType() uint8 { } } -// fileDescription is embedded by memfs implementations of +// fileDescription is embedded by tmpfs implementations of // vfs.FileDescriptionImpl. type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl - - flags uint32 // status flags; immutable } func (fd *fileDescription) filesystem() *filesystem { @@ -273,18 +282,6 @@ func (fd *fileDescription) inode() *inode { return fd.vfsfd.Dentry().Impl().(*dentry).inode } -// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. -func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) { - return fd.flags, nil -} - -// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags. -func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error { - // None of the flags settable by fcntl(F_SETFL) are supported, so this is a - // no-op. - return nil -} - // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { var stat linux.Statx diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 28ba950bd..8653d2f63 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -762,7 +762,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, mounts.IncRef() } - tg := k.newThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock) + tg := k.NewThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits) ctx := args.NewContext(k) // Get the root directory from the MountNamespace. @@ -841,9 +841,11 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, AbstractSocketNamespace: args.AbstractSocketNamespace, ContainerID: args.ContainerID, } - if _, err := k.tasks.NewTask(config); err != nil { + t, err := k.tasks.NewTask(config) + if err != nil { return nil, 0, err } + t.traceExecEvent(tc) // Simulate exec for tracing. // Success. tgid := k.tasks.Root.IDOfThreadGroup(tg) @@ -1118,6 +1120,22 @@ func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error { return lastErr } +// RebuildTraceContexts rebuilds the trace context for all tasks. +// +// Unfortunately, if these are built while tracing is not enabled, then we will +// not have meaningful trace data. Rebuilding here ensures that we can do so +// after tracing has been enabled. +func (k *Kernel) RebuildTraceContexts() { + k.extMu.Lock() + defer k.extMu.Unlock() + k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + + for t, tid := range k.tasks.Root.tids { + t.rebuildTraceContext(tid) + } +} + // FeatureSet returns the FeatureSet. func (k *Kernel) FeatureSet() *cpuid.FeatureSet { return k.featureSet @@ -1173,6 +1191,11 @@ func (k *Kernel) GlobalInit() *ThreadGroup { return k.globalInit } +// TestOnly_SetGlobalInit sets the thread group with ID 1 in the root PID namespace. +func (k *Kernel) TestOnly_SetGlobalInit(tg *ThreadGroup) { + k.globalInit = tg +} + // ApplicationCores returns the number of CPUs visible to sandboxed // applications. func (k *Kernel) ApplicationCores() uint { diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index 24ea002ba..b14429854 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -15,17 +15,29 @@ package kernel import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" ) -// Restartable sequences, as described in https://lwn.net/Articles/650333/. +// Restartable sequences. +// +// We support two different APIs for restartable sequences. +// +// 1. The upstream interface added in v4.18. +// 2. The interface described in https://lwn.net/Articles/650333/. +// +// Throughout this file and other parts of the kernel, the latter is referred +// to as "old rseq". This interface was never merged upstream, but is supported +// for a limited set of applications that use it regardless. -// RSEQCriticalRegion describes a restartable sequence critical region. +// OldRSeqCriticalRegion describes an old rseq critical region. // // +stateify savable -type RSEQCriticalRegion struct { +type OldRSeqCriticalRegion struct { // When a task in this thread group has its CPU preempted (as defined by // platform.ErrContextCPUPreempted) or has a signal delivered to an // application handler while its instruction pointer is in CriticalSection, @@ -35,86 +47,359 @@ type RSEQCriticalRegion struct { Restart usermem.Addr } -// RSEQAvailable returns true if t supports restartable sequences. -func (t *Task) RSEQAvailable() bool { +// RSeqAvailable returns true if t supports (old and new) restartable sequences. +func (t *Task) RSeqAvailable() bool { return t.k.useHostCores && t.k.Platform.DetectsCPUPreemption() } -// RSEQCriticalRegion returns a copy of t's thread group's current restartable -// sequence. -func (t *Task) RSEQCriticalRegion() RSEQCriticalRegion { - return *t.tg.rscr.Load().(*RSEQCriticalRegion) +// SetRSeq registers addr as this thread's rseq structure. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) SetRSeq(addr usermem.Addr, length, signature uint32) error { + if t.rseqAddr != 0 { + if t.rseqAddr != addr { + return syserror.EINVAL + } + if t.rseqSignature != signature { + return syserror.EINVAL + } + return syserror.EBUSY + } + + // rseq must be aligned and correctly sized. + if addr&(linux.AlignOfRSeq-1) != 0 { + return syserror.EINVAL + } + if length != linux.SizeOfRSeq { + return syserror.EINVAL + } + if _, ok := t.MemoryManager().CheckIORange(addr, linux.SizeOfRSeq); !ok { + return syserror.EFAULT + } + + t.rseqAddr = addr + t.rseqSignature = signature + + // Initialize the CPUID. + // + // Linux implicitly does this on return from userspace, where failure + // would cause SIGSEGV. + if err := t.rseqUpdateCPU(); err != nil { + t.rseqAddr = 0 + t.rseqSignature = 0 + + t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return syserror.EFAULT + } + + return nil } -// SetRSEQCriticalRegion replaces t's thread group's restartable sequence. +// ClearRSeq unregisters addr as this thread's rseq structure. // -// Preconditions: t.RSEQAvailable() == true. -func (t *Task) SetRSEQCriticalRegion(rscr RSEQCriticalRegion) error { +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) ClearRSeq(addr usermem.Addr, length, signature uint32) error { + if t.rseqAddr == 0 { + return syserror.EINVAL + } + if t.rseqAddr != addr { + return syserror.EINVAL + } + if length != linux.SizeOfRSeq { + return syserror.EINVAL + } + if t.rseqSignature != signature { + return syserror.EPERM + } + + if err := t.rseqClearCPU(); err != nil { + return err + } + + t.rseqAddr = 0 + t.rseqSignature = 0 + + if t.oldRSeqCPUAddr == 0 { + // rseqCPU no longer needed. + t.rseqCPU = -1 + } + + return nil +} + +// OldRSeqCriticalRegion returns a copy of t's thread group's current +// old restartable sequence. +func (t *Task) OldRSeqCriticalRegion() OldRSeqCriticalRegion { + return *t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion) +} + +// SetOldRSeqCriticalRegion replaces t's thread group's old restartable +// sequence. +// +// Preconditions: t.RSeqAvailable() == true. +func (t *Task) SetOldRSeqCriticalRegion(r OldRSeqCriticalRegion) error { // These checks are somewhat more lenient than in Linux, which (bizarrely) - // requires rscr.CriticalSection to be non-empty and rscr.Restart to be - // outside of rscr.CriticalSection, even if rscr.CriticalSection.Start == 0 + // requires r.CriticalSection to be non-empty and r.Restart to be + // outside of r.CriticalSection, even if r.CriticalSection.Start == 0 // (which disables the critical region). - if rscr.CriticalSection.Start == 0 { - rscr.CriticalSection.End = 0 - rscr.Restart = 0 - t.tg.rscr.Store(&rscr) + if r.CriticalSection.Start == 0 { + r.CriticalSection.End = 0 + r.Restart = 0 + t.tg.oldRSeqCritical.Store(&r) return nil } - if rscr.CriticalSection.Start >= rscr.CriticalSection.End { + if r.CriticalSection.Start >= r.CriticalSection.End { return syserror.EINVAL } - if rscr.CriticalSection.Contains(rscr.Restart) { + if r.CriticalSection.Contains(r.Restart) { return syserror.EINVAL } - // TODO(jamieliu): check that rscr.CriticalSection and rscr.Restart are in - // the application address range, for consistency with Linux - t.tg.rscr.Store(&rscr) + // TODO(jamieliu): check that r.CriticalSection and r.Restart are in + // the application address range, for consistency with Linux. + t.tg.oldRSeqCritical.Store(&r) return nil } -// RSEQCPUAddr returns the address that RSEQ will keep updated with t's CPU -// number. +// OldRSeqCPUAddr returns the address that old rseq will keep updated with t's +// CPU number. // // Preconditions: The caller must be running on the task goroutine. -func (t *Task) RSEQCPUAddr() usermem.Addr { - return t.rseqCPUAddr +func (t *Task) OldRSeqCPUAddr() usermem.Addr { + return t.oldRSeqCPUAddr } -// SetRSEQCPUAddr replaces the address that RSEQ will keep updated with t's CPU -// number. +// SetOldRSeqCPUAddr replaces the address that old rseq will keep updated with +// t's CPU number. // -// Preconditions: t.RSEQAvailable() == true. The caller must be running on the +// Preconditions: t.RSeqAvailable() == true. The caller must be running on the // task goroutine. t's AddressSpace must be active. -func (t *Task) SetRSEQCPUAddr(addr usermem.Addr) error { - t.rseqCPUAddr = addr - if addr != 0 { - t.rseqCPU = int32(hostcpu.GetCPU()) - if err := t.rseqCopyOutCPU(); err != nil { - t.rseqCPUAddr = 0 - t.rseqCPU = -1 - return syserror.EINVAL // yes, EINVAL, not err or EFAULT - } - } else { - t.rseqCPU = -1 +func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error { + t.oldRSeqCPUAddr = addr + + // Check that addr is writable. + // + // N.B. rseqUpdateCPU may fail on a bad t.rseqAddr as well. That's + // unfortunate, but unlikely in a correct program. + if err := t.rseqUpdateCPU(); err != nil { + t.oldRSeqCPUAddr = 0 + return syserror.EINVAL // yes, EINVAL, not err or EFAULT } return nil } // Preconditions: The caller must be running on the task goroutine. t's // AddressSpace must be active. -func (t *Task) rseqCopyOutCPU() error { +func (t *Task) rseqUpdateCPU() error { + if t.rseqAddr == 0 && t.oldRSeqCPUAddr == 0 { + t.rseqCPU = -1 + return nil + } + + t.rseqCPU = int32(hostcpu.GetCPU()) + + // Update both CPUs, even if one fails. + rerr := t.rseqCopyOutCPU() + oerr := t.oldRSeqCopyOutCPU() + + if rerr != nil { + return rerr + } + return oerr +} + +// Preconditions: The caller must be running on the task goroutine. t's +// AddressSpace must be active. +func (t *Task) oldRSeqCopyOutCPU() error { + if t.oldRSeqCPUAddr == 0 { + return nil + } + buf := t.CopyScratchBuffer(4) usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) - _, err := t.CopyOutBytes(t.rseqCPUAddr, buf) + _, err := t.CopyOutBytes(t.oldRSeqCPUAddr, buf) + return err +} + +// Preconditions: The caller must be running on the task goroutine. t's +// AddressSpace must be active. +func (t *Task) rseqCopyOutCPU() error { + if t.rseqAddr == 0 { + return nil + } + + buf := t.CopyScratchBuffer(8) + // CPUIDStart and CPUID are the first two fields in linux.RSeq. + usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) // CPUIDStart + usermem.ByteOrder.PutUint32(buf[4:], uint32(t.rseqCPU)) // CPUID + // N.B. This write is not atomic, but since this occurs on the task + // goroutine then as long as userspace uses a single-instruction read + // it can't see an invalid value. + _, err := t.CopyOutBytes(t.rseqAddr, buf) + return err +} + +// Preconditions: The caller must be running on the task goroutine. t's +// AddressSpace must be active. +func (t *Task) rseqClearCPU() error { + buf := t.CopyScratchBuffer(8) + // CPUIDStart and CPUID are the first two fields in linux.RSeq. + usermem.ByteOrder.PutUint32(buf, 0) // CPUIDStart + usermem.ByteOrder.PutUint32(buf[4:], linux.RSEQ_CPU_ID_UNINITIALIZED) // CPUID + // N.B. This write is not atomic, but since this occurs on the task + // goroutine then as long as userspace uses a single-instruction read + // it can't see an invalid value. + _, err := t.CopyOutBytes(t.rseqAddr, buf) return err } +// rseqAddrInterrupt checks if IP is in a critical section, and aborts if so. +// +// This is a bit complex since both the RSeq and RSeqCriticalSection structs +// are stored in userspace. So we must: +// +// 1. Copy in the address of RSeqCriticalSection from RSeq. +// 2. Copy in RSeqCriticalSection itself. +// 3. Validate critical section struct version, address range, abort address. +// 4. Validate the abort signature (4 bytes preceding abort IP match expected +// signature). +// 5. Clear address of RSeqCriticalSection from RSeq. +// 6. Finally, conditionally abort. +// +// See kernel/rseq.c:rseq_ip_fixup for reference. +// +// Preconditions: The caller must be running on the task goroutine. t's +// AddressSpace must be active. +func (t *Task) rseqAddrInterrupt() { + if t.rseqAddr == 0 { + return + } + + critAddrAddr, ok := t.rseqAddr.AddLength(linux.OffsetOfRSeqCriticalSection) + if !ok { + // SetRSeq should validate this. + panic(fmt.Sprintf("t.rseqAddr (%#x) not large enough", t.rseqAddr)) + } + + if t.Arch().Width() != 8 { + // We only handle 64-bit for now. + t.Debugf("Only 64-bit rseq supported.") + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + buf := t.CopyScratchBuffer(8) + if _, err := t.CopyInBytes(critAddrAddr, buf); err != nil { + t.Debugf("Failed to copy critical section address from %#x for rseq: %v", critAddrAddr, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + critAddr := usermem.Addr(usermem.ByteOrder.Uint64(buf)) + if critAddr == 0 { + return + } + + buf = t.CopyScratchBuffer(linux.SizeOfRSeqCriticalSection) + if _, err := t.CopyInBytes(critAddr, buf); err != nil { + t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + // Manually marshal RSeqCriticalSection as this is in the hot path when + // rseq is enabled. It must be as fast as possible. + // + // TODO(b/130243041): Replace with go_marshal. + cs := linux.RSeqCriticalSection{ + Version: usermem.ByteOrder.Uint32(buf[0:4]), + Flags: usermem.ByteOrder.Uint32(buf[4:8]), + Start: usermem.ByteOrder.Uint64(buf[8:16]), + PostCommitOffset: usermem.ByteOrder.Uint64(buf[16:24]), + Abort: usermem.ByteOrder.Uint64(buf[24:32]), + } + + if cs.Version != 0 { + t.Debugf("Unknown version in %+v", cs) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + start := usermem.Addr(cs.Start) + critRange, ok := start.ToRange(cs.PostCommitOffset) + if !ok { + t.Debugf("Invalid start and offset in %+v", cs) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + abort := usermem.Addr(cs.Abort) + if critRange.Contains(abort) { + t.Debugf("Abort in critical section in %+v", cs) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + // Verify signature. + sigAddr := abort - linux.SizeOfRSeqSignature + + buf = t.CopyScratchBuffer(linux.SizeOfRSeqSignature) + if _, err := t.CopyInBytes(sigAddr, buf); err != nil { + t.Debugf("Failed to copy critical section signature from %#x for rseq: %v", sigAddr, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + sig := usermem.ByteOrder.Uint32(buf) + if sig != t.rseqSignature { + t.Debugf("Mismatched rseq signature %d != %d", sig, t.rseqSignature) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + // Clear the critical section address. + // + // NOTE(b/143949567): We don't support any rseq flags, so we always + // restart if we are in the critical section, and thus *always* clear + // critAddrAddr. + if _, err := t.MemoryManager().ZeroOut(t, critAddrAddr, int64(t.Arch().Width()), usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + t.Debugf("Failed to clear critical section address from %#x for rseq: %v", critAddrAddr, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + // Finally we can actually decide whether or not to restart. + if !critRange.Contains(usermem.Addr(t.Arch().IP())) { + return + } + + t.Arch().SetIP(uintptr(cs.Abort)) +} + // Preconditions: The caller must be running on the task goroutine. -func (t *Task) rseqInterrupt() { - rscr := t.tg.rscr.Load().(*RSEQCriticalRegion) - if ip := t.Arch().IP(); rscr.CriticalSection.Contains(usermem.Addr(ip)) { - t.Debugf("Interrupted RSEQ critical section at %#x; restarting at %#x", ip, rscr.Restart) - t.Arch().SetIP(uintptr(rscr.Restart)) - t.Arch().SetRSEQInterruptedIP(ip) +func (t *Task) oldRSeqInterrupt() { + r := t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion) + if ip := t.Arch().IP(); r.CriticalSection.Contains(usermem.Addr(ip)) { + t.Debugf("Interrupted rseq critical section at %#x; restarting at %#x", ip, r.Restart) + t.Arch().SetIP(uintptr(r.Restart)) + t.Arch().SetOldRSeqInterruptedIP(ip) } } + +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) rseqInterrupt() { + t.rseqAddrInterrupt() + t.oldRSeqInterrupt() +} diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index 93fe68a3e..de9617e9d 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -302,7 +302,7 @@ func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Cred return syserror.ERANGE } - // TODO(b/29354920): Clear undo entries in all processes + // TODO(gvisor.dev/issue/137): Clear undo entries in all processes. sem.value = val sem.pid = pid s.changeTime = ktime.NowFromContext(ctx) @@ -336,7 +336,7 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti for i, val := range vals { sem := &s.sems[i] - // TODO(b/29354920): Clear undo entries in all processes + // TODO(gvisor.dev/issue/137): Clear undo entries in all processes. sem.value = int16(val) sem.pid = pid sem.wakeWaiters() @@ -481,7 +481,7 @@ func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (ch } // All operations succeeded, apply them. - // TODO(b/29354920): handle undo operations. + // TODO(gvisor.dev/issue/137): handle undo operations. for i, v := range tmpVals { s.sems[i].value = v s.sems[i].wakeWaiters() diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 5bd610f68..19034a21e 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -71,9 +71,20 @@ type Registry struct { mu sync.Mutex `state:"nosave"` // shms maps segment ids to segments. + // + // shms holds all referenced segments, which are removed on the last + // DecRef. Thus, it cannot itself hold a reference on the Shm. + // + // Since removal only occurs after the last (unlocked) DecRef, there + // exists a short window during which a Shm still exists in Shm, but is + // unreferenced. Users must use TryIncRef to determine if the Shm is + // still valid. shms map[ID]*Shm // keysToShms maps segment keys to segments. + // + // Shms in keysToShms are guaranteed to be referenced, as they are + // removed by disassociateKey before the last DecRef. keysToShms map[Key]*Shm // Sum of the sizes of all existing segments rounded up to page size, in @@ -95,10 +106,18 @@ func NewRegistry(userNS *auth.UserNamespace) *Registry { } // FindByID looks up a segment given an ID. +// +// FindByID returns a reference on Shm. func (r *Registry) FindByID(id ID) *Shm { r.mu.Lock() defer r.mu.Unlock() - return r.shms[id] + s := r.shms[id] + // Take a reference on s. If TryIncRef fails, s has reached the last + // DecRef, but hasn't quite been removed from r.shms yet. + if s != nil && s.TryIncRef() { + return s + } + return nil } // dissociateKey removes the association between a segment and its key, @@ -119,6 +138,8 @@ func (r *Registry) dissociateKey(s *Shm) { // FindOrCreate looks up or creates a segment in the registry. It's functionally // analogous to open(2). +// +// FindOrCreate returns a reference on Shm. func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) { if (create || private) && (size < linux.SHMMIN || size > linux.SHMMAX) { // "A new segment was to be created and size is less than SHMMIN or @@ -166,6 +187,7 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui return nil, syserror.EEXIST } + shm.IncRef() return shm, nil } @@ -193,7 +215,14 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui // Need to create a new segment. creator := fs.FileOwnerFromContext(ctx) perms := fs.FilePermsFromMode(mode) - return r.newShm(ctx, pid, key, creator, perms, size) + s, err := r.newShm(ctx, pid, key, creator, perms, size) + if err != nil { + return nil, err + } + // The initial reference is held by s itself. Take another to return to + // the caller. + s.IncRef() + return s, nil } // newShm creates a new segment in the registry. @@ -296,22 +325,26 @@ func (r *Registry) remove(s *Shm) { // Shm represents a single shared memory segment. // -// Shm segment are backed directly by an allocation from platform -// memory. Segments are always mapped as a whole, greatly simplifying how -// mappings are tracked. However note that mremap and munmap calls may cause the -// vma for a segment to become fragmented; which requires special care when -// unmapping a segment. See mm/shm.go. +// Shm segment are backed directly by an allocation from platform memory. +// Segments are always mapped as a whole, greatly simplifying how mappings are +// tracked. However note that mremap and munmap calls may cause the vma for a +// segment to become fragmented; which requires special care when unmapping a +// segment. See mm/shm.go. // // Segments persist until they are explicitly marked for destruction via -// shmctl(SHM_RMID). +// MarkDestroyed(). // // Shm implements memmap.Mappable and memmap.MappingIdentity. // // +stateify savable type Shm struct { - // AtomicRefCount tracks the number of references to this segment from - // maps. A segment always holds a reference to itself, until it's marked for + // AtomicRefCount tracks the number of references to this segment. + // + // A segment holds a reference to itself until it is marked for // destruction. + // + // In addition to direct users, the MemoryManager will hold references + // via MappingIdentity. refs.AtomicRefCount mfp pgalloc.MemoryFileProvider @@ -484,9 +517,8 @@ type AttachOpts struct { // ConfigureAttach creates an mmap configuration for the segment with the // requested attach options. // -// ConfigureAttach returns with a ref on s on success. The caller should drop -// this once the map is installed. This reference prevents s from being -// destroyed before the returned configuration is used. +// Postconditions: The returned MMapOpts are valid only as long as a reference +// continues to be held on s. func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts AttachOpts) (memmap.MMapOpts, error) { s.mu.Lock() defer s.mu.Unlock() @@ -504,7 +536,6 @@ func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts Attac // in the user namespace that governs its IPC namespace." - man shmat(2) return memmap.MMapOpts{}, syserror.EACCES } - s.IncRef() return memmap.MMapOpts{ Length: s.size, Offset: 0, @@ -549,10 +580,15 @@ func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) { } creds := auth.CredentialsFromContext(ctx) - nattach := uint64(s.ReadRefs()) - // Don't report the self-reference we keep prior to being marked for - // destruction. However, also don't report a count of -1 for segments marked - // as destroyed, with no mappings. + // Use the reference count as a rudimentary count of the number of + // attaches. We exclude: + // + // 1. The reference the caller holds. + // 2. The self-reference held by s prior to destruction. + // + // Note that this may still overcount by including transient references + // used in concurrent calls. + nattach := uint64(s.ReadRefs()) - 1 if !s.pendingDestruction { nattach-- } @@ -620,18 +656,17 @@ func (s *Shm) MarkDestroyed() { s.registry.dissociateKey(s) s.mu.Lock() - // Only drop the segment's self-reference once, when destruction is - // requested. Otherwise, repeated calls to shmctl(IPC_RMID) would force a - // segment to be destroyed prematurely, potentially with active maps to the - // segment's address range. Remaining references are dropped when the - // segment is detached or unmaped. + defer s.mu.Unlock() if !s.pendingDestruction { s.pendingDestruction = true - s.mu.Unlock() // Must release s.mu before calling s.DecRef. + // Drop the self-reference so destruction occurs when all + // external references are gone. + // + // N.B. This cannot be the final DecRef, as the caller also + // holds a reference. s.DecRef() return } - s.mu.Unlock() } // checkOwnership verifies whether a segment may be accessed by ctx as an diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 220fa73a2..2fdee0282 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -339,6 +339,14 @@ func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn { return nil } +// LookupName looks up a syscall name. +func (s *SyscallTable) LookupName(sysno uintptr) string { + if sc, ok := s.Table[sysno]; ok { + return sc.Name + } + return fmt.Sprintf("sys_%d", sysno) // Unlikely. +} + // LookupEmulate looks up an emulation syscall number. func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) { sysno, ok := s.Emulate[addr] diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 80c8e5464..d25a7903b 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -15,6 +15,8 @@ package kernel import ( + gocontext "context" + "runtime/trace" "sync" "sync/atomic" @@ -390,7 +392,14 @@ type Task struct { // logPrefix is a string containing the task's thread ID in the root PID // namespace, and is prepended to log messages emitted by Task.Infof etc. - logPrefix atomic.Value `state:".(string)"` + logPrefix atomic.Value `state:"nosave"` + + // traceContext and traceTask are both used for tracing, and are + // updated along with the logPrefix in updateInfoLocked. + // + // These are exclusive to the task goroutine. + traceContext gocontext.Context `state:"nosave"` + traceTask *trace.Task `state:"nosave"` // creds is the task's credentials. // @@ -480,18 +489,43 @@ type Task struct { // netns is protected by mu. netns is owned by the task goroutine. netns bool - // If rseqPreempted is true, before the next call to p.Switch(), interrupt - // RSEQ critical regions as defined by tg.rseq and write the task - // goroutine's CPU number to rseqCPUAddr. rseqCPU is the last CPU number - // written to rseqCPUAddr. + // If rseqPreempted is true, before the next call to p.Switch(), + // interrupt rseq critical regions as defined by rseqAddr and + // tg.oldRSeqCritical and write the task goroutine's CPU number to + // rseqAddr/oldRSeqCPUAddr. // - // If rseqCPUAddr is 0, rseqCPU is -1. + // We support two ABIs for restartable sequences: // - // rseqCPUAddr, rseqCPU, and rseqPreempted are exclusive to the task - // goroutine. + // 1. The upstream interface added in v4.18, + // 2. An "old" interface never merged upstream. In the implementation, + // this is referred to as "old rseq". + // + // rseqPreempted is exclusive to the task goroutine. rseqPreempted bool `state:"nosave"` - rseqCPUAddr usermem.Addr - rseqCPU int32 + + // rseqCPU is the last CPU number written to rseqAddr/oldRSeqCPUAddr. + // + // If rseq is unused, rseqCPU is -1 for convenient use in + // platform.Context.Switch. + // + // rseqCPU is exclusive to the task goroutine. + rseqCPU int32 + + // oldRSeqCPUAddr is a pointer to the userspace old rseq CPU variable. + // + // oldRSeqCPUAddr is exclusive to the task goroutine. + oldRSeqCPUAddr usermem.Addr + + // rseqAddr is a pointer to the userspace linux.RSeq structure. + // + // rseqAddr is exclusive to the task goroutine. + rseqAddr usermem.Addr + + // rseqSignature is the signature that the rseq abort IP must be signed + // with. + // + // rseqSignature is exclusive to the task goroutine. + rseqSignature uint32 // copyScratchBuffer is a buffer available to CopyIn/CopyOut // implementations that require an intermediate buffer to copy data @@ -528,14 +562,6 @@ func (t *Task) loadPtraceTracer(tracer *Task) { t.ptraceTracer.Store(tracer) } -func (t *Task) saveLogPrefix() string { - return t.logPrefix.Load().(string) -} - -func (t *Task) loadLogPrefix(prefix string) { - t.logPrefix.Store(prefix) -} - func (t *Task) saveSyscallFilters() []bpf.Program { if f := t.syscallFilters.Load(); f != nil { return f.([]bpf.Program) @@ -549,6 +575,7 @@ func (t *Task) loadSyscallFilters(filters []bpf.Program) { // afterLoad is invoked by stateify. func (t *Task) afterLoad() { + t.updateInfoLocked() t.interruptChan = make(chan struct{}, 1) t.gosched.State = TaskGoroutineNonexistent if t.stop != nil { diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go index dd69939f9..4a4a69ee2 100644 --- a/pkg/sentry/kernel/task_block.go +++ b/pkg/sentry/kernel/task_block.go @@ -16,6 +16,7 @@ package kernel import ( "runtime" + "runtime/trace" "time" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -133,19 +134,24 @@ func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error { runtime.Gosched() } + region := trace.StartRegion(t.traceContext, blockRegion) select { case <-C: + region.End() t.SleepFinish(true) + // Woken by event. return nil case <-interrupt: + region.End() t.SleepFinish(false) // Return the indicated error on interrupt. return syserror.ErrInterrupted case <-timerChan: - // We've timed out. + region.End() t.SleepFinish(true) + // We've timed out. return syserror.ETIMEDOUT } } diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 0916fd658..247bd4aba 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -236,14 +236,19 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { } else if opts.NewPIDNamespace { pidns = pidns.NewChild(userns) } + tg := t.tg + rseqAddr := usermem.Addr(0) + rseqSignature := uint32(0) if opts.NewThreadGroup { tg.mounts.IncRef() sh := t.tg.signalHandlers if opts.NewSignalHandlers { sh = sh.Fork() } - tg = t.k.newThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock) + tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy()) + rseqAddr = t.rseqAddr + rseqSignature = t.rseqSignature } cfg := &TaskConfig{ @@ -260,6 +265,8 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { UTSNamespace: utsns, IPCNamespace: ipcns, AbstractSocketNamespace: t.abstractSockets, + RSeqAddr: rseqAddr, + RSeqSignature: rseqSignature, ContainerID: t.ContainerID(), } if opts.NewThreadGroup { @@ -299,6 +306,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { // nt that it must receive before its task goroutine starts running. tid := nt.k.tasks.Root.IDOfTask(nt) defer nt.Start(tid) + t.traceCloneEvent(tid) // "If fork/clone and execve are allowed by @prog, any child processes will // be constrained to the same filters and system call ABI as the parent." - diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 17a089b90..fa6528386 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -129,6 +129,7 @@ type runSyscallAfterExecStop struct { } func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { + t.traceExecEvent(r.tc) t.tg.pidns.owner.mu.Lock() t.tg.execing = nil if t.killed() { @@ -189,9 +190,11 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.updateRSSLocked() // Restartable sequence state is discarded. t.rseqPreempted = false - t.rseqCPUAddr = 0 t.rseqCPU = -1 - t.tg.rscr.Store(&RSEQCriticalRegion{}) + t.rseqAddr = 0 + t.rseqSignature = 0 + t.oldRSeqCPUAddr = 0 + t.tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{}) t.tg.pidns.owner.mu.Unlock() // Remove FDs with the CloseOnExec flag set. @@ -253,7 +256,7 @@ func (t *Task) promoteLocked() { t.tg.leader = t t.Infof("Becoming TID %d (in root PID namespace)", t.tg.pidns.owner.Root.tids[t]) - t.updateLogPrefixLocked() + t.updateInfoLocked() // Reap the original leader. If it has a tracer, detach it instead of // waiting for it to acknowledge the original leader's death. oldLeader.exitParentNotified = true diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 535f03e50..435761e5a 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -236,6 +236,7 @@ func (*runExit) execute(t *Task) taskRunState { type runExitMain struct{} func (*runExitMain) execute(t *Task) taskRunState { + t.traceExitEvent() lastExiter := t.exitThreadGroup() // If the task has a cleartid, and the thread group wasn't killed by a diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index a29e9b9eb..0fb3661de 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -16,6 +16,7 @@ package kernel import ( "fmt" + "runtime/trace" "sort" "gvisor.dev/gvisor/pkg/log" @@ -127,11 +128,88 @@ func (t *Task) debugDumpStack() { } } -// updateLogPrefix updates the task's cached log prefix to reflect its -// current thread ID. +// trace definitions. +// +// Note that all region names are prefixed by ':' in order to ensure that they +// are lexically ordered before all system calls, which use the naked system +// call name (e.g. "read") for maximum clarity. +const ( + traceCategory = "task" + runRegion = ":run" + blockRegion = ":block" + cpuidRegion = ":cpuid" + faultRegion = ":fault" +) + +// updateInfoLocked updates the task's cached log prefix and tracing +// information to reflect its current thread ID. // // Preconditions: The task's owning TaskSet.mu must be locked. -func (t *Task) updateLogPrefixLocked() { +func (t *Task) updateInfoLocked() { // Use the task's TID in the root PID namespace for logging. - t.logPrefix.Store(fmt.Sprintf("[% 4d] ", t.tg.pidns.owner.Root.tids[t])) + tid := t.tg.pidns.owner.Root.tids[t] + t.logPrefix.Store(fmt.Sprintf("[% 4d] ", tid)) + t.rebuildTraceContext(tid) +} + +// rebuildTraceContext rebuilds the trace context. +// +// Precondition: the passed tid must be the tid in the root namespace. +func (t *Task) rebuildTraceContext(tid ThreadID) { + // Re-initialize the trace context. + if t.traceTask != nil { + t.traceTask.End() + } + + // Note that we define the "task type" to be the dynamic TID. This does + // not align perfectly with the documentation for "tasks" in the + // tracing package. Tasks may be assumed to be bounded by analysis + // tools. However, if we just use a generic "task" type here, then the + // "user-defined tasks" page on the tracing dashboard becomes nearly + // unusable, as it loads all traces from all tasks. + // + // We can assume that the number of tasks in the system is not + // arbitrarily large (in general it won't be, especially for cases + // where we're collecting a brief profile), so using the TID is a + // reasonable compromise in this case. + t.traceContext, t.traceTask = trace.NewTask(t, fmt.Sprintf("tid:%d", tid)) +} + +// traceCloneEvent is called when a new task is spawned. +// +// ntid must be the new task's ThreadID in the root namespace. +func (t *Task) traceCloneEvent(ntid ThreadID) { + if !trace.IsEnabled() { + return + } + trace.Logf(t.traceContext, traceCategory, "spawn: %d", ntid) +} + +// traceExitEvent is called when a task exits. +func (t *Task) traceExitEvent() { + if !trace.IsEnabled() { + return + } + trace.Logf(t.traceContext, traceCategory, "exit status: 0x%x", t.exitStatus.Status()) +} + +// traceExecEvent is called when a task calls exec. +func (t *Task) traceExecEvent(tc *TaskContext) { + if !trace.IsEnabled() { + return + } + d := tc.MemoryManager.Executable() + if d == nil { + trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>") + return + } + defer d.DecRef() + root := t.fsContext.RootDirectory() + if root == nil { + trace.Logf(t.traceContext, traceCategory, "exec: << no root directory >>") + return + } + defer root.DecRef() + n, _ := d.FullName(root) + trace.Logf(t.traceContext, traceCategory, "exec: %s", n) } diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index c92266c59..6357273d3 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -17,6 +17,7 @@ package kernel import ( "bytes" "runtime" + "runtime/trace" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -168,12 +169,22 @@ func (*runApp) execute(t *Task) taskRunState { // Apply restartable sequences. if t.rseqPreempted { t.rseqPreempted = false - if t.rseqCPUAddr != 0 { + if t.rseqAddr != 0 || t.oldRSeqCPUAddr != 0 { + // Linux writes the CPU on every preemption. We only do + // so if it changed. Thus we may delay delivery of + // SIGSEGV if rseqAddr/oldRSeqCPUAddr is invalid. cpu := int32(hostcpu.GetCPU()) if t.rseqCPU != cpu { t.rseqCPU = cpu if err := t.rseqCopyOutCPU(); err != nil { - t.Warningf("Failed to copy CPU to %#x for RSEQ: %v", t.rseqCPUAddr, err) + t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err) + t.forceSignal(linux.SIGSEGV, false) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + // Re-enter the task run loop for signal delivery. + return (*runApp)(nil) + } + if err := t.oldRSeqCopyOutCPU(); err != nil { + t.Debugf("Failed to copy CPU to %#x for old rseq: %v", t.oldRSeqCPUAddr, err) t.forceSignal(linux.SIGSEGV, false) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) // Re-enter the task run loop for signal delivery. @@ -205,9 +216,11 @@ func (*runApp) execute(t *Task) taskRunState { t.tg.pidns.owner.mu.RUnlock() } + region := trace.StartRegion(t.traceContext, runRegion) t.accountTaskGoroutineEnter(TaskGoroutineRunningApp) info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU) t.accountTaskGoroutineLeave(TaskGoroutineRunningApp) + region.End() if clearSinglestep { t.Arch().ClearSingleStep() @@ -225,6 +238,7 @@ func (*runApp) execute(t *Task) taskRunState { case platform.ErrContextSignalCPUID: // Is this a CPUID instruction? + region := trace.StartRegion(t.traceContext, cpuidRegion) expected := arch.CPUIDInstruction[:] found := make([]byte, len(expected)) _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found) @@ -232,10 +246,12 @@ func (*runApp) execute(t *Task) taskRunState { // Skip the cpuid instruction. t.Arch().CPUIDEmulate(t) t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected))) + region.End() // Resume execution. return (*runApp)(nil) } + region.End() // Not an actual CPUID, but required copy-in. // The instruction at the given RIP was not a CPUID, and we // fallthrough to the default signal deliver behavior below. @@ -251,8 +267,10 @@ func (*runApp) execute(t *Task) taskRunState { // an application-generated signal and we should continue execution // normally. if at.Any() { + region := trace.StartRegion(t.traceContext, faultRegion) addr := usermem.Addr(info.Addr()) err := t.MemoryManager().HandleUserFault(t, addr, at, usermem.Addr(t.Arch().Stack())) + region.End() if err == nil { // The fault was handled appropriately. // We can resume running the application. @@ -260,6 +278,12 @@ func (*runApp) execute(t *Task) taskRunState { } // Is this a vsyscall that we need emulate? + // + // Note that we don't track vsyscalls as part of a + // specific trace region. This is because regions don't + // stack, and the actual system call will count as a + // region. We should be able to easily identify + // vsyscalls by having a <fault><syscall> pair. if at.Execute { if sysno, ok := t.tc.st.LookupEmulate(addr); ok { return t.doVsyscall(addr, sysno) @@ -306,7 +330,7 @@ func (*runApp) execute(t *Task) taskRunState { return (*runApp)(nil) case platform.ErrContextCPUPreempted: - // Ensure that RSEQ critical sections are interrupted and per-thread + // Ensure that rseq critical sections are interrupted and per-thread // CPU values are updated before the next platform.Context.Switch(). t.rseqPreempted = true return (*runApp)(nil) diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index ae6fc4025..58af16ee2 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" ) @@ -79,6 +80,13 @@ type TaskConfig struct { // AbstractSocketNamespace is the AbstractSocketNamespace of the new task. AbstractSocketNamespace *AbstractSocketNamespace + // RSeqAddr is a pointer to the the userspace linux.RSeq structure. + RSeqAddr usermem.Addr + + // RSeqSignature is the signature that the rseq abort IP must be signed + // with. + RSeqSignature uint32 + // ContainerID is the container the new task belongs to. ContainerID string } @@ -126,6 +134,8 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { ipcns: cfg.IPCNamespace, abstractSockets: cfg.AbstractSocketNamespace, rseqCPU: -1, + rseqAddr: cfg.RSeqAddr, + rseqSignature: cfg.RSeqSignature, futexWaiter: futex.NewWaiter(), containerID: cfg.ContainerID, } @@ -154,10 +164,10 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { // Below this point, newTask is expected not to fail (there is no rollback // of assignTIDsLocked or any of the following). - // Logging on t's behalf will panic if t.logPrefix hasn't been initialized. - // This is the earliest point at which we can do so (since t now has thread - // IDs). - t.updateLogPrefixLocked() + // Logging on t's behalf will panic if t.logPrefix hasn't been + // initialized. This is the earliest point at which we can do so + // (since t now has thread IDs). + t.updateInfoLocked() if cfg.InheritParent != nil { t.parent = cfg.InheritParent.parent diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index b543d536a..3180f5560 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -17,6 +17,7 @@ package kernel import ( "fmt" "os" + "runtime/trace" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -160,6 +161,10 @@ func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval u ctrl = ctrlStopAndReinvokeSyscall } else { fn := s.Lookup(sysno) + var region *trace.Region // Only non-nil if tracing == true. + if trace.IsEnabled() { + region = trace.StartRegion(t.traceContext, s.LookupName(sysno)) + } if fn != nil { // Call our syscall implementation. rval, ctrl, err = fn(t, args) @@ -167,6 +172,9 @@ func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval u // Use the missing function if not found. rval, err = t.SyscallTable().Missing(t, sysno, args) } + if region != nil { + region.End() + } } if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) { diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 72568d296..c0197a563 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -238,8 +238,8 @@ type ThreadGroup struct { // execed is protected by the TaskSet mutex. execed bool - // rscr is the thread group's RSEQ critical region. - rscr atomic.Value `state:".(*RSEQCriticalRegion)"` + // oldRSeqCritical is the thread group's old rseq critical region. + oldRSeqCritical atomic.Value `state:".(*OldRSeqCriticalRegion)"` // mounts is the thread group's mount namespace. This does not really // correspond to a "mount namespace" in Linux, but is more like a @@ -256,35 +256,35 @@ type ThreadGroup struct { tty *TTY } -// newThreadGroup returns a new, empty thread group in PID namespace ns. The +// NewThreadGroup returns a new, empty thread group in PID namespace ns. The // thread group leader will send its parent terminationSignal when it exits. // The new thread group isn't visible to the system until a task has been // created inside of it by a successful call to TaskSet.NewTask. -func (k *Kernel) newThreadGroup(mounts *fs.MountNamespace, ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup { +func (k *Kernel) NewThreadGroup(mntns *fs.MountNamespace, pidns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet) *ThreadGroup { tg := &ThreadGroup{ threadGroupNode: threadGroupNode{ - pidns: ns, + pidns: pidns, }, signalHandlers: sh, terminationSignal: terminationSignal, ioUsage: &usage.IO{}, limits: limits, - mounts: mounts, + mounts: mntns, } tg.itimerRealTimer = ktime.NewTimer(k.monotonicClock, &itimerRealListener{tg: tg}) tg.timers = make(map[linux.TimerID]*IntervalTimer) - tg.rscr.Store(&RSEQCriticalRegion{}) + tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{}) return tg } -// saveRscr is invoked by stateify. -func (tg *ThreadGroup) saveRscr() *RSEQCriticalRegion { - return tg.rscr.Load().(*RSEQCriticalRegion) +// saveOldRSeqCritical is invoked by stateify. +func (tg *ThreadGroup) saveOldRSeqCritical() *OldRSeqCriticalRegion { + return tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion) } -// loadRscr is invoked by stateify. -func (tg *ThreadGroup) loadRscr(rscr *RSEQCriticalRegion) { - tg.rscr.Store(rscr) +// loadOldRSeqCritical is invoked by stateify. +func (tg *ThreadGroup) loadOldRSeqCritical(r *OldRSeqCriticalRegion) { + tg.oldRSeqCritical.Store(r) } // SignalHandlers returns the signal handlers used by tg. diff --git a/pkg/sentry/kernel/tty.go b/pkg/sentry/kernel/tty.go index 34f84487a..048de26dc 100644 --- a/pkg/sentry/kernel/tty.go +++ b/pkg/sentry/kernel/tty.go @@ -21,8 +21,19 @@ import "sync" // // +stateify savable type TTY struct { + // Index is the terminal index. It is immutable. + Index uint32 + mu sync.Mutex `state:"nosave"` // tg is protected by mu. tg *ThreadGroup } + +// TTY returns the thread group's controlling terminal. If nil, there is no +// controlling terminal. +func (tg *ThreadGroup) TTY() *TTY { + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + return tg.tty +} diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index c2c3ec06e..6299a3e2f 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -408,6 +408,8 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el start = vaddr } if vaddr < end { + // NOTE(b/37474556): Linux allows out-of-order + // segments, in violation of the spec. ctx.Infof("PT_LOAD headers out-of-order. %#x < %#x", vaddr, end) return loadedELF{}, syserror.ENOEXEC } diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD index 3ef84245b..112794e9c 100644 --- a/pkg/sentry/memmap/BUILD +++ b/pkg/sentry/memmap/BUILD @@ -41,7 +41,6 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/log", - "//pkg/refs", "//pkg/sentry/context", "//pkg/sentry/platform", "//pkg/sentry/usermem", diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index 03b99aaea..16a722a13 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -18,7 +18,6 @@ package memmap import ( "fmt" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -235,8 +234,11 @@ type InvalidateOpts struct { // coincidental; fs.File implements MappingIdentity, and some // fs.InodeOperations implement Mappable.) type MappingIdentity interface { - // MappingIdentity is reference-counted. - refs.RefCounter + // IncRef increments the MappingIdentity's reference count. + IncRef() + + // DecRef decrements the MappingIdentity's reference count. + DecRef() // MappedName returns the application-visible name shown in // /proc/[pid]/maps. diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go index 8c2246bb4..79610acb7 100644 --- a/pkg/sentry/mm/procfs.go +++ b/pkg/sentry/mm/procfs.go @@ -66,8 +66,6 @@ func (mm *MemoryManager) ReadMapsDataInto(ctx context.Context, buf *bytes.Buffer var start usermem.Addr for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { - // FIXME(b/30793614): If we use a usermem.Addr for the handle, we get - // "panic: autosave error: type usermem.Addr is not registered". mm.appendVMAMapsEntryLocked(ctx, vseg, buf) } @@ -81,7 +79,6 @@ func (mm *MemoryManager) ReadMapsDataInto(ctx context.Context, buf *bytes.Buffer // // Artifically adjust the seqfile handle so we only output vsyscall entry once. if start != vsyscallEnd { - // FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd. buf.WriteString(vsyscallMapsEntry) } } @@ -97,8 +94,6 @@ func (mm *MemoryManager) ReadMapsSeqFileData(ctx context.Context, handle seqfile start = *handle.(*usermem.Addr) } for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { - // FIXME(b/30793614): If we use a usermem.Addr for the handle, we get - // "panic: autosave error: type usermem.Addr is not registered". vmaAddr := vseg.End() data = append(data, seqfile.SeqData{ Buf: mm.vmaMapsEntryLocked(ctx, vseg), @@ -116,7 +111,6 @@ func (mm *MemoryManager) ReadMapsSeqFileData(ctx context.Context, handle seqfile // // Artifically adjust the seqfile handle so we only output vsyscall entry once. if start != vsyscallEnd { - // FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd. vmaAddr := vsyscallEnd data = append(data, seqfile.SeqData{ Buf: []byte(vsyscallMapsEntry), @@ -187,15 +181,12 @@ func (mm *MemoryManager) ReadSmapsDataInto(ctx context.Context, buf *bytes.Buffe var start usermem.Addr for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { - // FIXME(b/30793614): If we use a usermem.Addr for the handle, we get - // "panic: autosave error: type usermem.Addr is not registered". mm.vmaSmapsEntryIntoLocked(ctx, vseg, buf) } // We always emulate vsyscall, so advertise it here. See // ReadMapsSeqFileData for additional commentary. if start != vsyscallEnd { - // FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd. buf.WriteString(vsyscallSmapsEntry) } } @@ -211,8 +202,6 @@ func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfil start = *handle.(*usermem.Addr) } for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { - // FIXME(b/30793614): If we use a usermem.Addr for the handle, we get - // "panic: autosave error: type usermem.Addr is not registered". vmaAddr := vseg.End() data = append(data, seqfile.SeqData{ Buf: mm.vmaSmapsEntryLocked(ctx, vseg), @@ -223,7 +212,6 @@ func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfil // We always emulate vsyscall, so advertise it here. See // ReadMapsSeqFileData for additional commentary. if start != vsyscallEnd { - // FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd. vmaAddr := vsyscallEnd data = append(data, seqfile.SeqData{ Buf: []byte(vsyscallSmapsEntry), diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 6803d488c..f3afd98da 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -12,20 +12,30 @@ go_library( "bluepill_amd64.go", "bluepill_amd64.s", "bluepill_amd64_unsafe.go", + "bluepill_arm64.go", + "bluepill_arm64.s", + "bluepill_arm64_unsafe.go", "bluepill_fault.go", "bluepill_unsafe.go", "context.go", - "filters.go", + "filters_amd64.go", + "filters_arm64.go", "kvm.go", "kvm_amd64.go", "kvm_amd64_unsafe.go", + "kvm_arm64.go", + "kvm_arm64_unsafe.go", "kvm_const.go", + "kvm_const_arm64.go", "machine.go", "machine_amd64.go", "machine_amd64_unsafe.go", "machine_arm64.go", + "machine_arm64_unsafe.go", "machine_unsafe.go", "physical_map.go", + "physical_map_amd64.go", + "physical_map_arm64.go", "virtual_map.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/platform/kvm", diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go index 043de51b3..30dbb74d6 100644 --- a/pkg/sentry/platform/kvm/bluepill.go +++ b/pkg/sentry/platform/kvm/bluepill.go @@ -20,6 +20,7 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/safecopy" ) @@ -36,6 +37,18 @@ func sighandler() func dieTrampoline() var ( + // bounceSignal is the signal used for bouncing KVM. + // + // We use SIGCHLD because it is not masked by the runtime, and + // it will be ignored properly by other parts of the kernel. + bounceSignal = syscall.SIGCHLD + + // bounceSignalMask has only bounceSignal set. + bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1)) + + // bounce is the interrupt vector used to return to the kernel. + bounce = uint32(ring0.VirtualizationException) + // savedHandler is a pointer to the previous handler. // // This is called by bluepillHandler. @@ -45,6 +58,13 @@ var ( dieTrampolineAddr uintptr ) +// redpill invokes a syscall with -1. +// +//go:nosplit +func redpill() { + syscall.RawSyscall(^uintptr(0), 0, 0, 0) +} + // dieHandler is called by dieTrampoline. // //go:nosplit @@ -73,8 +93,8 @@ func (c *vCPU) die(context *arch.SignalContext64, msg string) { func init() { // Install the handler. - if err := safecopy.ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(sighandler).Pointer(), &savedHandler); err != nil { - panic(fmt.Sprintf("Unable to set handler for signal %d: %v", syscall.SIGSEGV, err)) + if err := safecopy.ReplaceSignalHandler(bluepillSignal, reflect.ValueOf(sighandler).Pointer(), &savedHandler); err != nil { + panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err)) } // Extract the address for the trampoline. diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go index 421c88220..133c2203d 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64.go @@ -24,26 +24,10 @@ import ( ) var ( - // bounceSignal is the signal used for bouncing KVM. - // - // We use SIGCHLD because it is not masked by the runtime, and - // it will be ignored properly by other parts of the kernel. - bounceSignal = syscall.SIGCHLD - - // bounceSignalMask has only bounceSignal set. - bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1)) - - // bounce is the interrupt vector used to return to the kernel. - bounce = uint32(ring0.VirtualizationException) + // The action for bluepillSignal is changed by sigaction(). + bluepillSignal = syscall.SIGSEGV ) -// redpill on amd64 invokes a syscall with -1. -// -//go:nosplit -func redpill() { - syscall.RawSyscall(^uintptr(0), 0, 0, 0) -} - // bluepillArchEnter is called during bluepillEnter. // //go:nosplit diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go index 9d8af143e..a63a6a071 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go @@ -23,13 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform/ring0" ) -// bluepillArchContext returns the arch-specific context. -// -//go:nosplit -func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 { - return &((*arch.UContext64)(context).MContext) -} - // dieArchSetup initializes the state for dieTrampoline. // // The amd64 dieTrampoline requires the vCPU to be set in BX, and the last RIP diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go new file mode 100644 index 000000000..552341721 --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill_arm64.go @@ -0,0 +1,79 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package kvm + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/platform/ring0" +) + +var ( + // The action for bluepillSignal is changed by sigaction(). + bluepillSignal = syscall.SIGILL +) + +// bluepillArchEnter is called during bluepillEnter. +// +//go:nosplit +func bluepillArchEnter(context *arch.SignalContext64) (c *vCPU) { + c = vCPUPtr(uintptr(context.Regs[8])) + regs := c.CPU.Registers() + regs.Regs = context.Regs + regs.Sp = context.Sp + regs.Pc = context.Pc + regs.Pstate = context.Pstate + regs.Pstate &^= uint64(ring0.KernelFlagsClear) + regs.Pstate |= ring0.KernelFlagsSet + return +} + +// bluepillArchExit is called during bluepillEnter. +// +//go:nosplit +func bluepillArchExit(c *vCPU, context *arch.SignalContext64) { + regs := c.CPU.Registers() + context.Regs = regs.Regs + context.Sp = regs.Sp + context.Pc = regs.Pc + context.Pstate = regs.Pstate + context.Pstate &^= uint64(ring0.UserFlagsClear) + context.Pstate |= ring0.UserFlagsSet +} + +// KernelSyscall handles kernel syscalls. +// +//go:nosplit +func (c *vCPU) KernelSyscall() { + regs := c.Registers() + if regs.Regs[8] != ^uint64(0) { + regs.Pc -= 4 // Rewind. + } + ring0.Halt() +} + +// KernelException handles kernel exceptions. +// +//go:nosplit +func (c *vCPU) KernelException(vector ring0.Vector) { + regs := c.Registers() + if vector == ring0.Vector(bounce) { + regs.Pc = 0 + } + ring0.Halt() +} diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.s b/pkg/sentry/platform/kvm/bluepill_arm64.s new file mode 100644 index 000000000..c61700892 --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill_arm64.s @@ -0,0 +1,87 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "textflag.h" + +// VCPU_CPU is the location of the CPU in the vCPU struct. +// +// This is guaranteed to be zero. +#define VCPU_CPU 0x0 + +// CPU_SELF is the self reference in ring0's percpu. +// +// This is guaranteed to be zero. +#define CPU_SELF 0x0 + +// Context offsets. +// +// Only limited use of the context is done in the assembly stub below, most is +// done in the Go handlers. +#define SIGINFO_SIGNO 0x0 +#define CONTEXT_PC 0x1B8 +#define CONTEXT_R0 0xB8 + +// See bluepill.go. +TEXT ·bluepill(SB),NOSPLIT,$0 +begin: + MOVD vcpu+0(FP), R8 + MOVD $VCPU_CPU(R8), R9 + ORR $0xffff000000000000, R9, R9 + // Trigger sigill. + // In ring0.Start(), the value of R8 will be stored into tpidr_el1. + // When the context was loaded into vcpu successfully, + // we will check if the value of R10 and R9 are the same. + WORD $0xd538d08a // MRS TPIDR_EL1, R10 +check_vcpu: + CMP R10, R9 + BEQ right_vCPU +wrong_vcpu: + CALL ·redpill(SB) + B begin +right_vCPU: + RET + +// sighandler: see bluepill.go for documentation. +// +// The arguments are the following: +// +// R0 - The signal number. +// R1 - Pointer to siginfo_t structure. +// R2 - Pointer to ucontext structure. +// +TEXT ·sighandler(SB),NOSPLIT,$0 + // si_signo should be sigill. + MOVD SIGINFO_SIGNO(R1), R7 + CMPW $4, R7 + BNE fallback + + MOVD CONTEXT_PC(R2), R7 + CMPW $0, R7 + BEQ fallback + + MOVD R2, 8(RSP) + BL ·bluepillHandler(SB) // Call the handler. + + RET + +fallback: + // Jump to the previous signal handler. + MOVD ·savedHandler(SB), R7 + B (R7) + +// dieTrampoline: see bluepill.go, bluepill_arm64_unsafe.go for documentation. +TEXT ·dieTrampoline(SB),NOSPLIT,$0 + // TODO(gvisor.dev/issue/1249): dieTrampoline supporting for Arm64. + MOVD R9, 8(RSP) + BL ·dieHandler(SB) diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go new file mode 100644 index 000000000..e5fac0d6a --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go @@ -0,0 +1,28 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package kvm + +import ( + "unsafe" + + "gvisor.dev/gvisor/pkg/sentry/arch" +) + +//go:nosplit +func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) { + // TODO(gvisor.dev/issue/1249): dieTrampoline supporting for Arm64. +} diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index ca011ef78..9add7c944 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -23,6 +23,8 @@ import ( "sync/atomic" "syscall" "unsafe" + + "gvisor.dev/gvisor/pkg/sentry/arch" ) //go:linkname throw runtime.throw @@ -49,6 +51,13 @@ func uintptrValue(addr *byte) uintptr { return (uintptr)(unsafe.Pointer(addr)) } +// bluepillArchContext returns the UContext64. +// +//go:nosplit +func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 { + return &((*arch.UContext64)(context).MContext) +} + // bluepillHandler is called from the signal stub. // // The world may be stopped while this is executing, and it executes on the diff --git a/pkg/sentry/platform/kvm/filters.go b/pkg/sentry/platform/kvm/filters_amd64.go index 7d949f1dd..7d949f1dd 100644 --- a/pkg/sentry/platform/kvm/filters.go +++ b/pkg/sentry/platform/kvm/filters_amd64.go diff --git a/pkg/sentry/platform/kvm/filters_arm64.go b/pkg/sentry/platform/kvm/filters_arm64.go new file mode 100644 index 000000000..9245d07c2 --- /dev/null +++ b/pkg/sentry/platform/kvm/filters_arm64.go @@ -0,0 +1,32 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/seccomp" +) + +// SyscallFilters returns syscalls made exclusively by the KVM platform. +func (*KVM) SyscallFilters() seccomp.SyscallRules { + return seccomp.SyscallRules{ + syscall.SYS_IOCTL: {}, + syscall.SYS_MMAP: {}, + syscall.SYS_RT_SIGSUSPEND: {}, + syscall.SYS_RT_SIGTIMEDWAIT: {}, + 0xffffffffffffffff: {}, // KVM uses syscall -1 to transition to host. + } +} diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index ee4cd2f4d..f2c2c059e 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -21,7 +21,6 @@ import ( "sync" "syscall" - "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" @@ -56,9 +55,7 @@ func New(deviceFile *os.File) (*KVM, error) { // Ensure global initialization is done. globalOnce.Do(func() { - physicalInit() - globalErr = updateSystemValues(int(fd)) - ring0.Init(cpuid.HostFeatureSet()) + globalErr = updateGlobalOnce(int(fd)) }) if globalErr != nil { return nil, globalErr diff --git a/pkg/sentry/platform/kvm/kvm_amd64.go b/pkg/sentry/platform/kvm/kvm_amd64.go index 5d8ef4761..c5a6f9c7d 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64.go +++ b/pkg/sentry/platform/kvm/kvm_amd64.go @@ -17,6 +17,7 @@ package kvm import ( + "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" ) @@ -211,3 +212,11 @@ type cpuidEntries struct { _ uint32 entries [_KVM_NR_CPUID_ENTRIES]cpuidEntry } + +// updateGlobalOnce does global initialization. It has to be called only once. +func updateGlobalOnce(fd int) error { + physicalInit() + err := updateSystemValues(int(fd)) + ring0.Init(cpuid.HostFeatureSet()) + return err +} diff --git a/pkg/sentry/platform/kvm/kvm_arm64.go b/pkg/sentry/platform/kvm/kvm_arm64.go new file mode 100644 index 000000000..2319c86d3 --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_arm64.go @@ -0,0 +1,83 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package kvm + +import ( + "syscall" +) + +// userMemoryRegion is a region of physical memory. +// +// This mirrors kvm_memory_region. +type userMemoryRegion struct { + slot uint32 + flags uint32 + guestPhysAddr uint64 + memorySize uint64 + userspaceAddr uint64 +} + +type kvmOneReg struct { + id uint64 + addr uint64 +} + +const KVM_NR_SPSR = 5 + +type userFpsimdState struct { + vregs [64]uint64 + fpsr uint32 + fpcr uint32 + reserved [2]uint32 +} + +type userRegs struct { + Regs syscall.PtraceRegs + sp_el1 uint64 + elr_el1 uint64 + spsr [KVM_NR_SPSR]uint64 + fpRegs userFpsimdState +} + +// runData is the run structure. This may be mapped for synchronous register +// access (although that doesn't appear to be supported by my kernel at least). +// +// This mirrors kvm_run. +type runData struct { + requestInterruptWindow uint8 + _ [7]uint8 + + exitReason uint32 + readyForInterruptInjection uint8 + ifFlag uint8 + _ [2]uint8 + + cr8 uint64 + apicBase uint64 + + // This is the union data for exits. Interpretation depends entirely on + // the exitReason above (see vCPU code for more information). + data [32]uint64 +} + +// updateGlobalOnce does global initialization. It has to be called only once. +func updateGlobalOnce(fd int) error { + physicalInit() + err := updateSystemValues(int(fd)) + updateVectorTable() + return err +} diff --git a/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go new file mode 100644 index 000000000..6531bae1d --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go @@ -0,0 +1,39 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package kvm + +import ( + "fmt" + "syscall" +) + +var ( + runDataSize int +) + +func updateSystemValues(fd int) error { + // Extract the mmap size. + sz, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(fd), _KVM_GET_VCPU_MMAP_SIZE, 0) + if errno != 0 { + return fmt.Errorf("getting VCPU mmap size: %v", errno) + } + // Save the data. + runDataSize = int(sz) + + // Success. + return nil +} diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go index 766131d60..1d5c77ff4 100644 --- a/pkg/sentry/platform/kvm/kvm_const.go +++ b/pkg/sentry/platform/kvm/kvm_const.go @@ -49,11 +49,13 @@ const ( _KVM_EXIT_SHUTDOWN = 0x8 _KVM_EXIT_FAIL_ENTRY = 0x9 _KVM_EXIT_INTERNAL_ERROR = 0x11 + _KVM_EXIT_SYSTEM_EVENT = 0x18 ) // KVM capability options. const ( - _KVM_CAP_MAX_VCPUS = 0x42 + _KVM_CAP_MAX_VCPUS = 0x42 + _KVM_CAP_ARM_VM_IPA_SIZE = 0xa5 ) // KVM limits. diff --git a/pkg/sentry/platform/kvm/kvm_const_arm64.go b/pkg/sentry/platform/kvm/kvm_const_arm64.go new file mode 100644 index 000000000..5a74c6e36 --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_const_arm64.go @@ -0,0 +1,132 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +// KVM ioctls for Arm64. +const ( + _KVM_GET_ONE_REG = 0x4010aeab + _KVM_SET_ONE_REG = 0x4010aeac + + _KVM_ARM_PREFERRED_TARGET = 0x8020aeaf + _KVM_ARM_VCPU_INIT = 0x4020aeae + _KVM_ARM64_REGS_PSTATE = 0x6030000000100042 + _KVM_ARM64_REGS_SP_EL1 = 0x6030000000100044 + _KVM_ARM64_REGS_R0 = 0x6030000000100000 + _KVM_ARM64_REGS_R1 = 0x6030000000100002 + _KVM_ARM64_REGS_R2 = 0x6030000000100004 + _KVM_ARM64_REGS_R3 = 0x6030000000100006 + _KVM_ARM64_REGS_R8 = 0x6030000000100010 + _KVM_ARM64_REGS_R18 = 0x6030000000100024 + _KVM_ARM64_REGS_PC = 0x6030000000100040 + _KVM_ARM64_REGS_MAIR_EL1 = 0x603000000013c510 + _KVM_ARM64_REGS_TCR_EL1 = 0x603000000013c102 + _KVM_ARM64_REGS_TTBR0_EL1 = 0x603000000013c100 + _KVM_ARM64_REGS_TTBR1_EL1 = 0x603000000013c101 + _KVM_ARM64_REGS_SCTLR_EL1 = 0x603000000013c080 + _KVM_ARM64_REGS_CPACR_EL1 = 0x603000000013c082 + _KVM_ARM64_REGS_VBAR_EL1 = 0x603000000013c600 +) + +// Arm64: Architectural Feature Access Control Register EL1. +const ( + _FPEN_NOTRAP = 0x3 + _FPEN_SHIFT = 0x20 +) + +// Arm64: System Control Register EL1. +const ( + _SCTLR_M = 1 << 0 + _SCTLR_C = 1 << 2 + _SCTLR_I = 1 << 12 +) + +// Arm64: Translation Control Register EL1. +const ( + _TCR_IPS_40BITS = 2 << 32 // PA=40 + _TCR_IPS_48BITS = 5 << 32 // PA=48 + + _TCR_T0SZ_OFFSET = 0 + _TCR_T1SZ_OFFSET = 16 + _TCR_IRGN0_SHIFT = 8 + _TCR_IRGN1_SHIFT = 24 + _TCR_ORGN0_SHIFT = 10 + _TCR_ORGN1_SHIFT = 26 + _TCR_SH0_SHIFT = 12 + _TCR_SH1_SHIFT = 28 + _TCR_TG0_SHIFT = 14 + _TCR_TG1_SHIFT = 30 + + _TCR_T0SZ_VA48 = 64 - 48 // VA=48 + _TCR_T1SZ_VA48 = 64 - 48 // VA=48 + + _TCR_ASID16 = 1 << 36 + _TCR_TBI0 = 1 << 37 + + _TCR_TXSZ_VA48 = (_TCR_T0SZ_VA48 << _TCR_T0SZ_OFFSET) | (_TCR_T1SZ_VA48 << _TCR_T1SZ_OFFSET) + + _TCR_TG0_4K = 0 << _TCR_TG0_SHIFT // 4K + _TCR_TG0_64K = 1 << _TCR_TG0_SHIFT // 64K + + _TCR_TG1_4K = 2 << _TCR_TG1_SHIFT + + _TCR_TG_FLAGS = _TCR_TG0_4K | _TCR_TG1_4K + + _TCR_IRGN0_WBWA = 1 << _TCR_IRGN0_SHIFT + _TCR_IRGN1_WBWA = 1 << _TCR_IRGN1_SHIFT + _TCR_IRGN_WBWA = _TCR_IRGN0_WBWA | _TCR_IRGN1_WBWA + + _TCR_ORGN0_WBWA = 1 << _TCR_ORGN0_SHIFT + _TCR_ORGN1_WBWA = 1 << _TCR_ORGN1_SHIFT + + _TCR_ORGN_WBWA = _TCR_ORGN0_WBWA | _TCR_ORGN1_WBWA + + _TCR_SHARED = (3 << _TCR_SH0_SHIFT) | (3 << _TCR_SH1_SHIFT) + + _TCR_CACHE_FLAGS = _TCR_IRGN_WBWA | _TCR_ORGN_WBWA +) + +// Arm64: Memory Attribute Indirection Register EL1. +const ( + _MT_DEVICE_nGnRnE = 0 + _MT_DEVICE_nGnRE = 1 + _MT_DEVICE_GRE = 2 + _MT_NORMAL_NC = 3 + _MT_NORMAL = 4 + _MT_NORMAL_WT = 5 + _MT_EL1_INIT = (0 << _MT_DEVICE_nGnRnE) | (0x4 << _MT_DEVICE_nGnRE * 8) | (0xc << _MT_DEVICE_GRE * 8) | (0x44 << _MT_NORMAL_NC * 8) | (0xff << _MT_NORMAL * 8) | (0xbb << _MT_NORMAL_WT * 8) +) + +const ( + _KVM_ARM_VCPU_POWER_OFF = 0 // CPU is started in OFF state + _KVM_ARM_VCPU_PSCI_0_2 = 2 // CPU uses PSCI v0.2 +) + +// Arm64: Exception Syndrome Register EL1. +const ( + _ESR_ELx_FSC = 0x3F + + _ESR_SEGV_MAPERR_L0 = 0x4 + _ESR_SEGV_MAPERR_L1 = 0x5 + _ESR_SEGV_MAPERR_L2 = 0x6 + _ESR_SEGV_MAPERR_L3 = 0x7 + + _ESR_SEGV_ACCERR_L1 = 0x9 + _ESR_SEGV_ACCERR_L2 = 0xa + _ESR_SEGV_ACCERR_L3 = 0xb + + _ESR_SEGV_PEMERR_L1 = 0xd + _ESR_SEGV_PEMERR_L2 = 0xe + _ESR_SEGV_PEMERR_L3 = 0xf +) diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go index 61227cafb..7156c245f 100644 --- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -135,3 +135,43 @@ func (c *vCPU) setSignalMask() error { } return nil } + +// setUserRegisters sets user registers in the vCPU. +func (c *vCPU) setUserRegisters(uregs *userRegs) error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_REGS, + uintptr(unsafe.Pointer(uregs))); errno != 0 { + return fmt.Errorf("error setting user registers: %v", errno) + } + return nil +} + +// getUserRegisters reloads user registers in the vCPU. +// +// This is safe to call from a nosplit context. +// +//go:nosplit +func (c *vCPU) getUserRegisters(uregs *userRegs) syscall.Errno { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_GET_REGS, + uintptr(unsafe.Pointer(uregs))); errno != 0 { + return errno + } + return 0 +} + +// setSystemRegisters sets system registers. +func (c *vCPU) setSystemRegisters(sregs *systemRegs) error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_SREGS, + uintptr(unsafe.Pointer(sregs))); errno != 0 { + return fmt.Errorf("error setting system registers: %v", errno) + } + return nil +} diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go index b7e2cfb9d..7ae47f291 100644 --- a/pkg/sentry/platform/kvm/machine_arm64.go +++ b/pkg/sentry/platform/kvm/machine_arm64.go @@ -12,8 +12,38 @@ // See the License for the specific language governing permissions and // limitations under the License. +// +build arm64 + package kvm +import ( + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" + "gvisor.dev/gvisor/pkg/sentry/usermem" +) + +type vCPUArchState struct { + // PCIDs is the set of PCIDs for this vCPU. + // + // This starts above fixedKernelPCID. + PCIDs *pagetables.PCIDs +} + +const ( + // fixedKernelPCID is a fixed kernel PCID used for the kernel page + // tables. We must start allocating user PCIDs above this in order to + // avoid any conflict (see below). + fixedKernelPCID = 1 + + // poolPCIDs is the number of PCIDs to record in the database. As this + // grows, assignment can take longer, since it is a simple linear scan. + // Beyond a relatively small number, there are likely few perform + // benefits, since the TLB has likely long since lost any translations + // from more than a few PCIDs past. + poolPCIDs = 8 +) + // Get all read-only physicalRegions. func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) { var rdonlyRegions []region @@ -59,3 +89,95 @@ func availableRegionsForSetMem() (phyRegions []physicalRegion) { return phyRegions } + +// dropPageTables drops cached page table entries. +func (m *machine) dropPageTables(pt *pagetables.PageTables) { + m.mu.Lock() + defer m.mu.Unlock() + + // Clear from all PCIDs. + for _, c := range m.vCPUs { + c.PCIDs.Drop(pt) + } +} + +// nonCanonical generates a canonical address return. +// +//go:nosplit +func nonCanonical(addr uint64, signal int32, info *arch.SignalInfo) (usermem.AccessType, error) { + *info = arch.SignalInfo{ + Signo: signal, + Code: arch.SignalInfoKernel, + } + info.SetAddr(addr) // Include address. + return usermem.NoAccess, platform.ErrContextSignal +} + +// fault generates an appropriate fault return. +// +//go:nosplit +func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (usermem.AccessType, error) { + faultAddr := c.GetFaultAddr() + code, user := c.ErrorCode() + + // Reset the pointed SignalInfo. + *info = arch.SignalInfo{Signo: signal} + info.SetAddr(uint64(faultAddr)) + + read := true + write := false + execute := true + + ret := code & _ESR_ELx_FSC + switch ret { + case _ESR_SEGV_MAPERR_L0, _ESR_SEGV_MAPERR_L1, _ESR_SEGV_MAPERR_L2, _ESR_SEGV_MAPERR_L3: + info.Code = 1 //SEGV_MAPERR + read = false + write = true + execute = false + case _ESR_SEGV_ACCERR_L1, _ESR_SEGV_ACCERR_L2, _ESR_SEGV_ACCERR_L3, _ESR_SEGV_PEMERR_L1, _ESR_SEGV_PEMERR_L2, _ESR_SEGV_PEMERR_L3: + info.Code = 2 // SEGV_ACCERR. + read = true + write = false + execute = false + default: + info.Code = 2 + } + + if !user { + read = true + write = false + execute = true + + } + accessType := usermem.AccessType{ + Read: read, + Write: write, + Execute: execute, + } + + return accessType, platform.ErrContextSignal +} + +// retryInGuest runs the given function in guest mode. +// +// If the function does not complete in guest mode (due to execution of a +// system call due to a GC stall, for example), then it will be retried. The +// given function must be idempotent as a result of the retry mechanism. +func (m *machine) retryInGuest(fn func()) { + c := m.Get() + defer m.Put(c) + for { + c.ClearErrorCode() // See below. + bluepill(c) // Force guest mode. + fn() // Execute the given function. + _, user := c.ErrorCode() + if user { + // If user is set, then we haven't bailed back to host + // mode via a kernel exception or system call. We + // consider the full function to have executed in guest + // mode and we can return. + break + } + } +} diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go new file mode 100644 index 000000000..3f2f97a6b --- /dev/null +++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go @@ -0,0 +1,362 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package kvm + +import ( + "fmt" + "reflect" + "sync/atomic" + "syscall" + "unsafe" + + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/platform/ring0" + "gvisor.dev/gvisor/pkg/sentry/usermem" +) + +// setMemoryRegion initializes a region. +// +// This may be called from bluepillHandler, and therefore returns an errno +// directly (instead of wrapping in an error) to avoid allocations. +// +//go:nosplit +func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr) syscall.Errno { + userRegion := userMemoryRegion{ + slot: uint32(slot), + flags: 0, + guestPhysAddr: uint64(physical), + memorySize: uint64(length), + userspaceAddr: uint64(virtual), + } + + // Set the region. + _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(m.fd), + _KVM_SET_USER_MEMORY_REGION, + uintptr(unsafe.Pointer(&userRegion))) + return errno +} + +type kvmVcpuInit struct { + target uint32 + features [7]uint32 +} + +var vcpuInit kvmVcpuInit + +// initArchState initializes architecture-specific state. +func (m *machine) initArchState() error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(m.fd), + _KVM_ARM_PREFERRED_TARGET, + uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 { + panic(fmt.Sprintf("error setting KVM_ARM_PREFERRED_TARGET failed: %v", errno)) + } + return nil +} + +func getPageWithReflect(p uintptr) []byte { + return (*(*[0xFFFFFF]byte)(unsafe.Pointer(p & ^uintptr(syscall.Getpagesize()-1))))[:syscall.Getpagesize()] +} + +// Work around: move ring0.Vectors() into a specific address with 11-bits alignment. +// +// According to the design documentation of Arm64, +// the start address of exception vector table should be 11-bits aligned. +// Please see the code in linux kernel as reference: arch/arm64/kernel/entry.S +// But, we can't align a function's start address to a specific address by using golang. +// We have raised this question in golang community: +// https://groups.google.com/forum/m/#!topic/golang-dev/RPj90l5x86I +// This function will be removed when golang supports this feature. +// +// There are 2 jobs were implemented in this function: +// 1, move the start address of exception vector table into the specific address. +// 2, modify the offset of each instruction. +func updateVectorTable() { + fromLocation := reflect.ValueOf(ring0.Vectors).Pointer() + offset := fromLocation & (1<<11 - 1) + if offset != 0 { + offset = 1<<11 - offset + } + + toLocation := fromLocation + offset + page := getPageWithReflect(toLocation) + if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_WRITE|syscall.PROT_EXEC); err != nil { + panic(err) + } + + page = getPageWithReflect(toLocation + 4096) + if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_WRITE|syscall.PROT_EXEC); err != nil { + panic(err) + } + + // Move exception-vector-table into the specific address. + var entry *uint32 + var entryFrom *uint32 + for i := 1; i <= 0x800; i++ { + entry = (*uint32)(unsafe.Pointer(toLocation + 0x800 - uintptr(i))) + entryFrom = (*uint32)(unsafe.Pointer(fromLocation + 0x800 - uintptr(i))) + *entry = *entryFrom + } + + // The offset from the address of each unconditionally branch is changed. + // We should modify the offset of each instruction. + nums := []uint32{0x0, 0x80, 0x100, 0x180, 0x200, 0x280, 0x300, 0x380, 0x400, 0x480, 0x500, 0x580, 0x600, 0x680, 0x700, 0x780} + for _, num := range nums { + entry = (*uint32)(unsafe.Pointer(toLocation + uintptr(num))) + *entry = *entry - (uint32)(offset/4) + } + + page = getPageWithReflect(toLocation) + if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_EXEC); err != nil { + panic(err) + } + + page = getPageWithReflect(toLocation + 4096) + if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_EXEC); err != nil { + panic(err) + } +} + +// initArchState initializes architecture-specific state. +func (c *vCPU) initArchState() error { + var ( + reg kvmOneReg + data uint64 + regGet kvmOneReg + dataGet uint64 + ) + + reg.addr = uint64(reflect.ValueOf(&data).Pointer()) + regGet.addr = uint64(reflect.ValueOf(&dataGet).Pointer()) + + vcpuInit.features[0] |= (1 << _KVM_ARM_VCPU_PSCI_0_2) + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_ARM_VCPU_INIT, + uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 { + panic(fmt.Sprintf("error setting KVM_ARM_VCPU_INIT failed: %v", errno)) + } + + // cpacr_el1 + reg.id = _KVM_ARM64_REGS_CPACR_EL1 + data = (_FPEN_NOTRAP << _FPEN_SHIFT) + if err := c.setOneRegister(®); err != nil { + return err + } + + // sctlr_el1 + regGet.id = _KVM_ARM64_REGS_SCTLR_EL1 + if err := c.getOneRegister(®Get); err != nil { + return err + } + + dataGet |= (_SCTLR_M | _SCTLR_C | _SCTLR_I) + data = dataGet + reg.id = _KVM_ARM64_REGS_SCTLR_EL1 + if err := c.setOneRegister(®); err != nil { + return err + } + + // tcr_el1 + data = _TCR_TXSZ_VA48 | _TCR_CACHE_FLAGS | _TCR_SHARED | _TCR_TG_FLAGS | _TCR_ASID16 | _TCR_IPS_40BITS + reg.id = _KVM_ARM64_REGS_TCR_EL1 + if err := c.setOneRegister(®); err != nil { + return err + } + + // mair_el1 + data = _MT_EL1_INIT + reg.id = _KVM_ARM64_REGS_MAIR_EL1 + if err := c.setOneRegister(®); err != nil { + return err + } + + // ttbr0_el1 + data = c.machine.kernel.PageTables.TTBR0_EL1(false, 0) + + reg.id = _KVM_ARM64_REGS_TTBR0_EL1 + if err := c.setOneRegister(®); err != nil { + return err + } + + c.SetTtbr0Kvm(uintptr(data)) + + // ttbr1_el1 + data = c.machine.kernel.PageTables.TTBR1_EL1(false, 0) + + reg.id = _KVM_ARM64_REGS_TTBR1_EL1 + if err := c.setOneRegister(®); err != nil { + return err + } + + // sp_el1 + data = c.CPU.StackTop() + reg.id = _KVM_ARM64_REGS_SP_EL1 + if err := c.setOneRegister(®); err != nil { + return err + } + + // pc + reg.id = _KVM_ARM64_REGS_PC + data = uint64(reflect.ValueOf(ring0.Start).Pointer()) + if err := c.setOneRegister(®); err != nil { + return err + } + + // r8 + reg.id = _KVM_ARM64_REGS_R8 + data = uint64(reflect.ValueOf(&c.CPU).Pointer()) + if err := c.setOneRegister(®); err != nil { + return err + } + + // vbar_el1 + reg.id = _KVM_ARM64_REGS_VBAR_EL1 + + fromLocation := reflect.ValueOf(ring0.Vectors).Pointer() + offset := fromLocation & (1<<11 - 1) + if offset != 0 { + offset = 1<<11 - offset + } + + toLocation := fromLocation + offset + data = uint64(ring0.KernelStartAddress | toLocation) + if err := c.setOneRegister(®); err != nil { + return err + } + + data = ring0.PsrDefaultSet | ring0.KernelFlagsSet + reg.id = _KVM_ARM64_REGS_PSTATE + if err := c.setOneRegister(®); err != nil { + return err + } + + return nil +} + +//go:nosplit +func (c *vCPU) loadSegments(tid uint64) { + // TODO(gvisor.dev/issue/1238): TLS is not supported. + // Get TLS from tpidr_el0. + atomic.StoreUint64(&c.tid, tid) +} + +func (c *vCPU) setOneRegister(reg *kvmOneReg) error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_ONE_REG, + uintptr(unsafe.Pointer(reg))); errno != 0 { + return fmt.Errorf("error setting one register: %v", errno) + } + return nil +} + +func (c *vCPU) getOneRegister(reg *kvmOneReg) error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_GET_ONE_REG, + uintptr(unsafe.Pointer(reg))); errno != 0 { + return fmt.Errorf("error setting one register: %v", errno) + } + return nil +} + +// setCPUID sets the CPUID to be used by the guest. +func (c *vCPU) setCPUID() error { + return nil +} + +// setSystemTime sets the TSC for the vCPU. +func (c *vCPU) setSystemTime() error { + return nil +} + +// setSignalMask sets the vCPU signal mask. +// +// This must be called prior to running the vCPU. +func (c *vCPU) setSignalMask() error { + // The layout of this structure implies that it will not necessarily be + // the same layout chosen by the Go compiler. It gets fudged here. + var data struct { + length uint32 + mask1 uint32 + mask2 uint32 + _ uint32 + } + data.length = 8 // Fixed sigset size. + data.mask1 = ^uint32(bounceSignalMask & 0xffffffff) + data.mask2 = ^uint32(bounceSignalMask >> 32) + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_SIGNAL_MASK, + uintptr(unsafe.Pointer(&data))); errno != 0 { + return fmt.Errorf("error setting signal mask: %v", errno) + } + + return nil +} + +// SwitchToUser unpacks architectural-details. +func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) (usermem.AccessType, error) { + // Check for canonical addresses. + if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Pc) { + return nonCanonical(regs.Pc, int32(syscall.SIGSEGV), info) + } else if !ring0.IsCanonical(regs.Sp) { + return nonCanonical(regs.Sp, int32(syscall.SIGBUS), info) + } + + var vector ring0.Vector + ttbr0App := switchOpts.PageTables.TTBR0_EL1(false, 0) + c.SetTtbr0App(uintptr(ttbr0App)) + + // TODO(gvisor.dev/issue/1238): full context-switch supporting for Arm64. + // The Arm64 user-mode execution state consists of: + // x0-x30 + // PC, SP, PSTATE + // V0-V31: 32 128-bit registers for floating point, and simd + // FPSR + // TPIDR_EL0, used for TLS + appRegs := switchOpts.Registers + c.SetAppAddr(ring0.KernelStartAddress | uintptr(unsafe.Pointer(appRegs))) + + entersyscall() + bluepill(c) + vector = c.CPU.SwitchToUser(switchOpts) + exitsyscall() + + switch vector { + case ring0.Syscall: + // Fast path: system call executed. + return usermem.NoAccess, nil + + case ring0.PageFault: + return c.fault(int32(syscall.SIGSEGV), info) + case 0xaa: + return usermem.NoAccess, nil + default: + return usermem.NoAccess, platform.ErrContextSignal + } + +} diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index ed9433311..f04be2ab5 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -87,46 +87,6 @@ func unmapRunData(r *runData) error { return nil } -// setUserRegisters sets user registers in the vCPU. -func (c *vCPU) setUserRegisters(uregs *userRegs) error { - if _, _, errno := syscall.RawSyscall( - syscall.SYS_IOCTL, - uintptr(c.fd), - _KVM_SET_REGS, - uintptr(unsafe.Pointer(uregs))); errno != 0 { - return fmt.Errorf("error setting user registers: %v", errno) - } - return nil -} - -// getUserRegisters reloads user registers in the vCPU. -// -// This is safe to call from a nosplit context. -// -//go:nosplit -func (c *vCPU) getUserRegisters(uregs *userRegs) syscall.Errno { - if _, _, errno := syscall.RawSyscall( - syscall.SYS_IOCTL, - uintptr(c.fd), - _KVM_GET_REGS, - uintptr(unsafe.Pointer(uregs))); errno != 0 { - return errno - } - return 0 -} - -// setSystemRegisters sets system registers. -func (c *vCPU) setSystemRegisters(sregs *systemRegs) error { - if _, _, errno := syscall.RawSyscall( - syscall.SYS_IOCTL, - uintptr(c.fd), - _KVM_SET_SREGS, - uintptr(unsafe.Pointer(sregs))); errno != 0 { - return fmt.Errorf("error setting system registers: %v", errno) - } - return nil -} - // atomicAddressSpace is an atomic address space pointer. type atomicAddressSpace struct { pointer unsafe.Pointer diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go index 586e91bb2..91de5dab1 100644 --- a/pkg/sentry/platform/kvm/physical_map.go +++ b/pkg/sentry/platform/kvm/physical_map.go @@ -24,15 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usermem" ) -const ( - // reservedMemory is a chunk of physical memory reserved starting at - // physical address zero. There are some special pages in this region, - // so we just call the whole thing off. - // - // Other architectures may define this to be zero. - reservedMemory = 0x100000000 -) - type region struct { virtual uintptr length uintptr @@ -59,8 +50,7 @@ func fillAddressSpace() (excludedRegions []region) { // We can cut vSize in half, because the kernel will be using the top // half and we ignore it while constructing mappings. It's as if we've // already excluded half the possible addresses. - vSize := uintptr(1) << ring0.VirtualAddressBits() - vSize = vSize >> 1 + vSize := ring0.UserspaceSize // We exclude reservedMemory below from our physical memory size, so it // needs to be dropped here as well. Otherwise, we could end up with diff --git a/pkg/sentry/platform/kvm/physical_map_amd64.go b/pkg/sentry/platform/kvm/physical_map_amd64.go new file mode 100644 index 000000000..c5adfb577 --- /dev/null +++ b/pkg/sentry/platform/kvm/physical_map_amd64.go @@ -0,0 +1,22 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +const ( + // reservedMemory is a chunk of physical memory reserved starting at + // physical address zero. There are some special pages in this region, + // so we just call the whole thing off. + reservedMemory = 0x100000000 +) diff --git a/pkg/sentry/fsimpl/proc/proc.go b/pkg/sentry/platform/kvm/physical_map_arm64.go index 31dec36de..4d8561453 100644 --- a/pkg/sentry/fsimpl/proc/proc.go +++ b/pkg/sentry/platform/kvm/physical_map_arm64.go @@ -12,5 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package proc implements a partial in-memory file system for procfs. -package proc +package kvm + +const ( + reservedMemory = 0 +) diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s index 2cd28b2d2..0bebee852 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s +++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s @@ -50,6 +50,21 @@ TEXT ·SpinLoop(SB),NOSPLIT,$0 start: B start +TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8 + NO_LOCAL_POINTERS + FMOVD $(9.9), F0 + MOVD $SYS_GETPID, R8 // getpid + SVC + FMOVD $(9.9), F1 + FCMPD F0, F1 + BNE isNaN + MOVD $1, R0 + MOVD R0, ret+0(FP) + RET +isNaN: + MOVD $0, ret+0(FP) + RET + // MVN: bitwise logical NOT // This case simulates an application that modified R0-R30. #define TWIDDLE_REGS() \ diff --git a/pkg/sentry/platform/ptrace/stub_amd64.s b/pkg/sentry/platform/ptrace/stub_amd64.s index 64c718d21..16f9c523e 100644 --- a/pkg/sentry/platform/ptrace/stub_amd64.s +++ b/pkg/sentry/platform/ptrace/stub_amd64.s @@ -64,6 +64,8 @@ begin: CMPQ AX, $0 JL error + MOVQ $0, BX + // SIGSTOP to wait for attach. // // The SYSCALL instruction will be used for future syscall injection by @@ -73,23 +75,26 @@ begin: MOVQ $SIGSTOP, SI SYSCALL - // The tracer may "detach" and/or allow code execution here in three cases: - // - // 1. New (traced) stub threads are explicitly detached by the - // goroutine in newSubprocess. However, they are detached while in - // group-stop, so they do not execute code here. - // - // 2. If a tracer thread exits, it implicitly detaches from the stub, - // potentially allowing code execution here. However, the Go runtime - // never exits individual threads, so this case never occurs. - // - // 3. subprocess.createStub clones a new stub process that is untraced, + // The sentry sets BX to 1 when creating stub process. + CMPQ BX, $1 + JE clone + + // Notify the Sentry that syscall exited. +done: + INT $3 + // Be paranoid. + JMP done +clone: + // subprocess.createStub clones a new stub process that is untraced, // thus executing this code. We setup the PDEATHSIG before SIGSTOPing // ourselves for attach by the tracer. // // R15 has been updated with the expected PPID. - JMP begin + CMPQ AX, $0 + JE begin + // The clone syscall returns a non-zero value. + JMP done error: // Exit with -errno. MOVQ AX, DI diff --git a/pkg/sentry/platform/ptrace/stub_arm64.s b/pkg/sentry/platform/ptrace/stub_arm64.s index 2c5e4d5cb..6162df02a 100644 --- a/pkg/sentry/platform/ptrace/stub_arm64.s +++ b/pkg/sentry/platform/ptrace/stub_arm64.s @@ -59,6 +59,8 @@ begin: CMP $0x0, R0 BLT error + MOVD $0, R9 + // SIGSTOP to wait for attach. // // The SYSCALL instruction will be used for future syscall injection by @@ -66,22 +68,26 @@ begin: MOVD $SYS_KILL, R8 MOVD $SIGSTOP, R1 SVC - // The tracer may "detach" and/or allow code execution here in three cases: - // - // 1. New (traced) stub threads are explicitly detached by the - // goroutine in newSubprocess. However, they are detached while in - // group-stop, so they do not execute code here. - // - // 2. If a tracer thread exits, it implicitly detaches from the stub, - // potentially allowing code execution here. However, the Go runtime - // never exits individual threads, so this case never occurs. - // - // 3. subprocess.createStub clones a new stub process that is untraced, + + // The sentry sets R9 to 1 when creating stub process. + CMP $1, R9 + BEQ clone + +done: + // Notify the Sentry that syscall exited. + BRK $3 + B done // Be paranoid. +clone: + // subprocess.createStub clones a new stub process that is untraced, // thus executing this code. We setup the PDEATHSIG before SIGSTOPing // ourselves for attach by the tracer. // // R7 has been updated with the expected PPID. - B begin + CMP $0, R0 + BEQ begin + + // The clone system call returned a non-zero value. + B done error: // Exit with -errno. diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index ddb1f41e3..20244fd95 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -21,6 +21,7 @@ import ( "sync" "syscall" + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/procid" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -429,13 +430,15 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) { } for { - // Execute the syscall instruction. - if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { + // Execute the syscall instruction. The task has to stop on the + // trap instruction which is right after the syscall + // instruction. + if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_CONT, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno)) } sig := t.wait(stopped) - if sig == (syscallEvent | syscall.SIGTRAP) { + if sig == syscall.SIGTRAP { // Reached syscall-enter-stop. break } else { @@ -447,18 +450,6 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) { } } - // Complete the actual system call. - if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { - panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno)) - } - - // Wait for syscall-exit-stop. "[Signal-delivery-stop] never happens - // between syscall-enter-stop and syscall-exit-stop; it happens *after* - // syscall-exit-stop.)" - ptrace(2), "Syscall-stops" - if sig := t.wait(stopped); sig != (syscallEvent | syscall.SIGTRAP) { - t.dumpAndPanic(fmt.Sprintf("wait failed: expected SIGTRAP, got %v [%d]", sig, sig)) - } - // Grab registers. if err := t.getRegs(regs); err != nil { panic(fmt.Sprintf("ptrace get regs failed: %v", err)) @@ -541,14 +532,14 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool { if isSingleStepping(regs) { if _, _, errno := syscall.RawSyscall6( syscall.SYS_PTRACE, - syscall.PTRACE_SYSEMU_SINGLESTEP, + unix.PTRACE_SYSEMU_SINGLESTEP, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { panic(fmt.Sprintf("ptrace sysemu failed: %v", errno)) } } else { if _, _, errno := syscall.RawSyscall6( syscall.SYS_PTRACE, - syscall.PTRACE_SYSEMU, + unix.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { panic(fmt.Sprintf("ptrace sysemu failed: %v", errno)) } diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go index 4649a94a7..e99798c56 100644 --- a/pkg/sentry/platform/ptrace/subprocess_amd64.go +++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go @@ -21,6 +21,8 @@ import ( "strings" "syscall" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" ) @@ -139,7 +141,55 @@ func (t *thread) adjustInitRegsRip() { t.initRegs.Rip -= initRegsRipAdjustment } -// Pass the expected PPID to the child via R15 when creating stub process +// Pass the expected PPID to the child via R15 when creating stub process. func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) { initregs.R15 = uint64(ppid) + // Rbx has to be set to 1 when creating stub process. + initregs.Rbx = 1 +} + +// patchSignalInfo patches the signal info to account for hitting the seccomp +// filters from vsyscall emulation, specified below. We allow for SIGSYS as a +// synchronous trap, but patch the structure to appear like a SIGSEGV with the +// Rip as the faulting address. +// +// Note that this should only be called after verifying that the signalInfo has +// been generated by the kernel. +func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) { + if linux.Signal(signalInfo.Signo) == linux.SIGSYS { + signalInfo.Signo = int32(linux.SIGSEGV) + + // Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered + // with the si_call_addr field pointing to the current RIP. This field + // aligns with the si_addr field for a SIGSEGV, so we don't need to touch + // anything there. We do need to unwind emulation however, so we set the + // instruction pointer to the faulting value, and "unpop" the stack. + regs.Rip = signalInfo.Addr() + regs.Rsp -= 8 + } +} + +// enableCpuidFault enables cpuid-faulting. +// +// This may fail on older kernels or hardware, so we just disregard the result. +// Host CPUID will be enabled. +// +// This is safe to call in an afterFork context. +// +//go:nosplit +func enableCpuidFault() { + syscall.RawSyscall6(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0, 0, 0, 0) +} + +// appendArchSeccompRules append architecture specific seccomp rules when creating BPF program. +// Ref attachedThread() for more detail. +func appendArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet { + return append(rules, seccomp.RuleSet{ + Rules: seccomp.SyscallRules{ + syscall.SYS_ARCH_PRCTL: []seccomp.Rule{ + {seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)}, + }, + }, + Action: linux.SECCOMP_RET_ALLOW, + }) } diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go index bec884ba5..7b975137f 100644 --- a/pkg/sentry/platform/ptrace/subprocess_arm64.go +++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go @@ -17,8 +17,12 @@ package ptrace import ( + "fmt" + "strings" "syscall" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" ) @@ -37,7 +41,7 @@ const ( // resetSysemuRegs sets up emulation registers. // // This should be called prior to calling sysemu. -func (s *subprocess) resetSysemuRegs(regs *syscall.PtraceRegs) { +func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) { } // createSyscallRegs sets up syscall registers. @@ -123,4 +127,39 @@ func (t *thread) adjustInitRegsRip() { // Pass the expected PPID to the child via X7 when creating stub process func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) { initregs.Regs[7] = uint64(ppid) + // R9 has to be set to 1 when creating stub process. + initregs.Regs[9] = 1 +} + +// patchSignalInfo patches the signal info to account for hitting the seccomp +// filters from vsyscall emulation, specified below. We allow for SIGSYS as a +// synchronous trap, but patch the structure to appear like a SIGSEGV with the +// Rip as the faulting address. +// +// Note that this should only be called after verifying that the signalInfo has +// been generated by the kernel. +func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) { + if linux.Signal(signalInfo.Signo) == linux.SIGSYS { + signalInfo.Signo = int32(linux.SIGSEGV) + + // Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered + // with the si_call_addr field pointing to the current RIP. This field + // aligns with the si_addr field for a SIGSEGV, so we don't need to touch + // anything there. We do need to unwind emulation however, so we set the + // instruction pointer to the faulting value, and "unpop" the stack. + regs.Pc = signalInfo.Addr() + regs.Sp -= 8 + } +} + +// Noop on arm64. +// +//go:nosplit +func enableCpuidFault() { +} + +// appendArchSeccompRules append architecture specific seccomp rules when creating BPF program. +// Ref attachedThread() for more detail. +func appendArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet { + return rules } diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index 3782d4332..74968dfdf 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -20,6 +20,7 @@ import ( "fmt" "syscall" + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/procid" @@ -53,7 +54,7 @@ func probeSeccomp() bool { for { // Attempt an emulation. - if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { + if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, unix.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno)) } @@ -77,27 +78,6 @@ func probeSeccomp() bool { } } -// patchSignalInfo patches the signal info to account for hitting the seccomp -// filters from vsyscall emulation, specified below. We allow for SIGSYS as a -// synchronous trap, but patch the structure to appear like a SIGSEGV with the -// Rip as the faulting address. -// -// Note that this should only be called after verifying that the signalInfo has -// been generated by the kernel. -func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) { - if linux.Signal(signalInfo.Signo) == linux.SIGSYS { - signalInfo.Signo = int32(linux.SIGSEGV) - - // Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered - // with the si_call_addr field pointing to the current RIP. This field - // aligns with the si_addr field for a SIGSEGV, so we don't need to touch - // anything there. We do need to unwind emulation however, so we set the - // instruction pointer to the faulting value, and "unpop" the stack. - regs.Rip = signalInfo.Addr() - regs.Rsp -= 8 - } -} - // createStub creates a fresh stub processes. // // Precondition: the runtime OS thread must be locked. @@ -149,7 +129,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro Rules: seccomp.SyscallRules{ syscall.SYS_GETTIMEOFDAY: {}, syscall.SYS_TIME: {}, - 309: {}, // SYS_GETCPU. + unix.SYS_GETCPU: {}, // SYS_GETCPU was not defined in package syscall on amd64. }, Action: linux.SECCOMP_RET_TRAP, Vsyscall: true, @@ -173,10 +153,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro // For the initial process creation. syscall.SYS_WAIT4: {}, - syscall.SYS_ARCH_PRCTL: []seccomp.Rule{ - {seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)}, - }, - syscall.SYS_EXIT: {}, + syscall.SYS_EXIT: {}, // For the stub prctl dance (all). syscall.SYS_PRCTL: []seccomp.Rule{ @@ -196,6 +173,8 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro }, Action: linux.SECCOMP_RET_ALLOW, }) + + rules = appendArchSeccompRules(rules) } instrs, err := seccomp.BuildProgram(rules, defaultAction) if err != nil { @@ -267,9 +246,8 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0) } - // Enable cpuid-faulting; this may fail on older kernels or hardware, - // so we just disregard the result. Host CPUID will be enabled. - syscall.RawSyscall6(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0, 0, 0, 0) + // Enable cpuid-faulting. + enableCpuidFault() // Call the stub; should not return. stubCall(stubStart, ppid) diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD index f1af18265..87f4552b5 100644 --- a/pkg/sentry/platform/ring0/BUILD +++ b/pkg/sentry/platform/ring0/BUILD @@ -71,6 +71,7 @@ go_library( "lib_amd64.go", "lib_amd64.s", "lib_arm64.go", + "lib_arm64.s", "ring0.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ring0", diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go index fbfbd9bab..dc0eeec01 100644 --- a/pkg/sentry/platform/ring0/defs_arm64.go +++ b/pkg/sentry/platform/ring0/defs_arm64.go @@ -73,6 +73,9 @@ type CPUArchState struct { // application context pointer. appAddr uintptr + + // lazyVFP is the value of cpacr_el1. + lazyVFP uintptr } // ErrorCode returns the last error code. diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s index 674c569e1..64e9c0845 100644 --- a/pkg/sentry/platform/ring0/entry_arm64.s +++ b/pkg/sentry/platform/ring0/entry_arm64.s @@ -31,6 +31,11 @@ #define RSV_REG R18_PLATFORM #define RSV_REG_APP R9 +#define FPEN_NOTRAP 0x3 +#define FPEN_SHIFT 20 + +#define FPEN_ENABLE (FPEN_NOTRAP << FPEN_SHIFT) + #define REGISTERS_SAVE(reg, offset) \ MOVD R0, offset+PTRACE_R0(reg); \ MOVD R1, offset+PTRACE_R1(reg); \ @@ -279,6 +284,16 @@ #define IRQ_DISABLE \ MSR $2, DAIFClr; +#define VFP_ENABLE \ + MOVD $FPEN_ENABLE, R0; \ + WORD $0xd5181040; \ //MSR R0, CPACR_EL1 + ISB $15; + +#define VFP_DISABLE \ + MOVD $0x0, R0; \ + WORD $0xd5181040; \ //MSR R0, CPACR_EL1 + ISB $15; + #define KERNEL_ENTRY_FROM_EL0 \ SUB $16, RSP, RSP; \ // step1, save r18, r9 into kernel temporary stack. STP (RSV_REG, RSV_REG_APP), 16*0(RSP); \ @@ -318,6 +333,11 @@ TEXT ·Halt(SB),NOSPLIT,$0 BNE mmio_exit MOVD $0, CPU_REGISTERS+PTRACE_R9(RSV_REG) mmio_exit: + // Disable fpsimd. + WORD $0xd5381041 // MRS CPACR_EL1, R1 + MOVD R1, CPU_LAZY_VFP(RSV_REG) + VFP_DISABLE + // MMIO_EXIT. MOVD $0, R9 MOVD R0, 0xffff000000001000(R9) @@ -449,6 +469,8 @@ TEXT ·El1_sync(SB),NOSPLIT,$0 BEQ el1_svc CMP $ESR_ELx_EC_BREAKPT_CUR, R24 BGE el1_dbg + CMP $ESR_ELx_EC_FP_ASIMD, R24 + BEQ el1_fpsimd_acc B el1_invalid el1_da: @@ -469,6 +491,10 @@ el1_svc: el1_dbg: B ·Shutdown(SB) +el1_fpsimd_acc: + VFP_ENABLE + B ·kernelExitToEl1(SB) // Resume. + el1_invalid: B ·Shutdown(SB) @@ -595,6 +621,10 @@ TEXT ·Vectors(SB),NOSPLIT,$0 B ·El0_error_invalid(SB) nop31Instructions() + // The exception-vector-table is required to be 11-bits aligned. + // Please see Linux source code as reference: arch/arm64/kernel/entry.s. + // For gvisor, I defined it as 4K in length, filled the 2nd 2K part with NOPs. + // So that, I can safely move the 1st 2K part into the address with 11-bits alignment. WORD $0xd503201f //nop nop31Instructions() WORD $0xd503201f diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go index 900ee6380..8bcfe1032 100644 --- a/pkg/sentry/platform/ring0/lib_arm64.go +++ b/pkg/sentry/platform/ring0/lib_arm64.go @@ -16,10 +16,24 @@ package ring0 -// LoadFloatingPoint loads floating point state by the most efficient mechanism -// available (set by Init). -var LoadFloatingPoint func(*byte) +// CPACREL1 returns the value of the CPACR_EL1 register. +func CPACREL1() (value uintptr) -// SaveFloatingPoint saves floating point state by the most efficient mechanism -// available (set by Init). -var SaveFloatingPoint func(*byte) +// FPCR returns the value of FPCR register. +func FPCR() (value uintptr) + +// SetFPCR writes the FPCR value. +func SetFPCR(value uintptr) + +// FPSR returns the value of FPSR register. +func FPSR() (value uintptr) + +// SetFPSR writes the FPSR value. +func SetFPSR(value uintptr) + +// SaveVRegs saves V0-V31 registers. +// V0-V31: 32 128-bit registers for floating point and simd. +func SaveVRegs(*byte) + +// LoadVRegs loads V0-V31 registers. +func LoadVRegs(*byte) diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s new file mode 100644 index 000000000..0e6a6235b --- /dev/null +++ b/pkg/sentry/platform/ring0/lib_arm64.s @@ -0,0 +1,121 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "funcdata.h" +#include "textflag.h" + +TEXT ·CPACREL1(SB),NOSPLIT,$0-8 + WORD $0xd5381041 // MRS CPACR_EL1, R1 + MOVD R1, ret+0(FP) + RET + +TEXT ·GetFPCR(SB),NOSPLIT,$0-8 + WORD $0xd53b4201 // MRS NZCV, R1 + MOVD R1, ret+0(FP) + RET + +TEXT ·GetFPSR(SB),NOSPLIT,$0-8 + WORD $0xd53b4421 // MRS FPSR, R1 + MOVD R1, ret+0(FP) + RET + +TEXT ·SetFPCR(SB),NOSPLIT,$0-8 + MOVD addr+0(FP), R1 + WORD $0xd51b4201 // MSR R1, NZCV + RET + +TEXT ·SetFPSR(SB),NOSPLIT,$0-8 + MOVD addr+0(FP), R1 + WORD $0xd51b4421 // MSR R1, FPSR + RET + +TEXT ·SaveVRegs(SB),NOSPLIT,$0-8 + MOVD addr+0(FP), R0 + + // Skip aarch64_ctx, fpsr, fpcr. + FMOVD F0, 16*1(R0) + FMOVD F1, 16*2(R0) + FMOVD F2, 16*3(R0) + FMOVD F3, 16*4(R0) + FMOVD F4, 16*5(R0) + FMOVD F5, 16*6(R0) + FMOVD F6, 16*7(R0) + FMOVD F7, 16*8(R0) + FMOVD F8, 16*9(R0) + FMOVD F9, 16*10(R0) + FMOVD F10, 16*11(R0) + FMOVD F11, 16*12(R0) + FMOVD F12, 16*13(R0) + FMOVD F13, 16*14(R0) + FMOVD F14, 16*15(R0) + FMOVD F15, 16*16(R0) + FMOVD F16, 16*17(R0) + FMOVD F17, 16*18(R0) + FMOVD F18, 16*19(R0) + FMOVD F19, 16*20(R0) + FMOVD F20, 16*21(R0) + FMOVD F21, 16*22(R0) + FMOVD F22, 16*23(R0) + FMOVD F23, 16*24(R0) + FMOVD F24, 16*25(R0) + FMOVD F25, 16*26(R0) + FMOVD F26, 16*27(R0) + FMOVD F27, 16*28(R0) + FMOVD F28, 16*29(R0) + FMOVD F29, 16*30(R0) + FMOVD F30, 16*31(R0) + FMOVD F31, 16*32(R0) + ISB $15 + + RET + +TEXT ·LoadVRegs(SB),NOSPLIT,$0-8 + MOVD addr+0(FP), R0 + + // Skip aarch64_ctx, fpsr, fpcr. + FMOVD 16*1(R0), F0 + FMOVD 16*2(R0), F1 + FMOVD 16*3(R0), F2 + FMOVD 16*4(R0), F3 + FMOVD 16*5(R0), F4 + FMOVD 16*6(R0), F5 + FMOVD 16*7(R0), F6 + FMOVD 16*8(R0), F7 + FMOVD 16*9(R0), F8 + FMOVD 16*10(R0), F9 + FMOVD 16*11(R0), F10 + FMOVD 16*12(R0), F11 + FMOVD 16*13(R0), F12 + FMOVD 16*14(R0), F13 + FMOVD 16*15(R0), F14 + FMOVD 16*16(R0), F15 + FMOVD 16*17(R0), F16 + FMOVD 16*18(R0), F17 + FMOVD 16*19(R0), F18 + FMOVD 16*20(R0), F19 + FMOVD 16*21(R0), F20 + FMOVD 16*22(R0), F21 + FMOVD 16*23(R0), F22 + FMOVD 16*24(R0), F23 + FMOVD 16*25(R0), F24 + FMOVD 16*26(R0), F25 + FMOVD 16*27(R0), F26 + FMOVD 16*28(R0), F27 + FMOVD 16*29(R0), F28 + FMOVD 16*30(R0), F29 + FMOVD 16*31(R0), F30 + FMOVD 16*32(R0), F31 + ISB $15 + + RET diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go index d7aa1c7cc..cd2a65f97 100644 --- a/pkg/sentry/platform/ring0/offsets_arm64.go +++ b/pkg/sentry/platform/ring0/offsets_arm64.go @@ -39,6 +39,7 @@ func Emit(w io.Writer) { fmt.Fprintf(w, "#define CPU_TTBR0_APP 0x%02x\n", reflect.ValueOf(&c.ttbr0App).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "#define CPU_VECTOR_CODE 0x%02x\n", reflect.ValueOf(&c.vecCode).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "#define CPU_APP_ADDR 0x%02x\n", reflect.ValueOf(&c.appAddr).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_LAZY_VFP 0x%02x\n", reflect.ValueOf(&c.lazyVFP).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "\n// Bits.\n") fmt.Fprintf(w, "#define _KERNEL_FLAGS 0x%02x\n", KernelFlagsSet) diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go index 2f65db70b..ba1f9043d 100644 --- a/pkg/sentry/sighandling/sighandling.go +++ b/pkg/sentry/sighandling/sighandling.go @@ -16,7 +16,6 @@ package sighandling import ( - "fmt" "os" "os/signal" "reflect" @@ -31,37 +30,25 @@ const numSignals = 32 // handleSignals listens for incoming signals and calls the given handler // function. // -// It starts when the start channel is closed, stops when the stop channel -// is closed, and closes done once it will no longer deliver signals to k. -func handleSignals(sigchans []chan os.Signal, handler func(linux.Signal), start, stop, done chan struct{}) { +// It stops when the stop channel is closed. The done channel is closed once it +// will no longer deliver signals to k. +func handleSignals(sigchans []chan os.Signal, handler func(linux.Signal), stop, done chan struct{}) { // Build a select case. - sc := []reflect.SelectCase{{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(start)}} + sc := []reflect.SelectCase{{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(stop)}} for _, sigchan := range sigchans { sc = append(sc, reflect.SelectCase{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(sigchan)}) } - started := false for { // Wait for a notification. index, _, ok := reflect.Select(sc) - // Was it the start / stop channel? + // Was it the stop channel? if index == 0 { if !ok { - if !started { - // start channel; start forwarding and - // swap this case for the stop channel - // to select stop requests. - started = true - sc[0] = reflect.SelectCase{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(stop)} - } else { - // stop channel; stop forwarding and - // clear this case so it is never - // selected again. - started = false - close(done) - sc[0].Chan = reflect.Value{} - } + // Stop forwarding and notify that it's done. + close(done) + return } continue } @@ -73,44 +60,17 @@ func handleSignals(sigchans []chan os.Signal, handler func(linux.Signal), start, // Otherwise, it was a signal on channel N. Index 0 represents the stop // channel, so index N represents the channel for signal N. - signal := linux.Signal(index) - - if !started { - // Kernel cannot receive signals, either because it is - // not ready yet or is shutting down. - // - // Kill ourselves if this signal would have killed the - // process before PrepareForwarding was called. i.e., all - // _SigKill signals; see Go - // src/runtime/sigtab_linux_generic.go. - // - // Otherwise ignore the signal. - // - // TODO(b/114489875): Drop in Go 1.12, which uses tgkill - // in runtime.raise. - switch signal { - case linux.SIGHUP, linux.SIGINT, linux.SIGTERM: - dieFromSignal(signal) - panic(fmt.Sprintf("Failed to die from signal %d", signal)) - default: - continue - } - } - - // Pass the signal to the handler. - handler(signal) + handler(linux.Signal(index)) } } -// PrepareHandler ensures that synchronous signals are passed to the given -// handler function and returns a callback that starts signal delivery, which -// itself returns a callback that stops signal handling. +// StartSignalForwarding ensures that synchronous signals are passed to the +// given handler function and returns a callback that stops signal delivery. // // Note that this function permanently takes over signal handling. After the // stop callback, signals revert to the default Go runtime behavior, which // cannot be overridden with external calls to signal.Notify. -func PrepareHandler(handler func(linux.Signal)) func() func() { - start := make(chan struct{}) +func StartSignalForwarding(handler func(linux.Signal)) func() { stop := make(chan struct{}) done := make(chan struct{}) @@ -128,13 +88,10 @@ func PrepareHandler(handler func(linux.Signal)) func() func() { signal.Notify(sigchan, syscall.Signal(sig)) } // Start up our listener. - go handleSignals(sigchans, handler, start, stop, done) // S/R-SAFE: synchronized by Kernel.extMu. + go handleSignals(sigchans, handler, stop, done) // S/R-SAFE: synchronized by Kernel.extMu. - return func() func() { - close(start) - return func() { - close(stop) - <-done - } + return func() { + close(stop) + <-done } } diff --git a/pkg/sentry/sighandling/sighandling_unsafe.go b/pkg/sentry/sighandling/sighandling_unsafe.go index c303435d5..1ebe22d34 100644 --- a/pkg/sentry/sighandling/sighandling_unsafe.go +++ b/pkg/sentry/sighandling/sighandling_unsafe.go @@ -15,8 +15,6 @@ package sighandling import ( - "fmt" - "runtime" "syscall" "unsafe" @@ -48,27 +46,3 @@ func IgnoreChildStop() error { return nil } - -// dieFromSignal kills the current process with sig. -// -// Preconditions: The default action of sig is termination. -func dieFromSignal(sig linux.Signal) { - runtime.LockOSThread() - defer runtime.UnlockOSThread() - - sa := sigaction{handler: linux.SIG_DFL} - if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), uintptr(unsafe.Pointer(&sa)), 0, linux.SignalSetSize, 0, 0); e != 0 { - panic(fmt.Sprintf("rt_sigaction failed: %v", e)) - } - - set := linux.MakeSignalSet(sig) - if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGPROCMASK, linux.SIG_UNBLOCK, uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0); e != 0 { - panic(fmt.Sprintf("rt_sigprocmask failed: %v", e)) - } - - if err := syscall.Tgkill(syscall.Getpid(), syscall.Gettid(), syscall.Signal(sig)); err != nil { - panic(fmt.Sprintf("tgkill failed: %v", err)) - } - - panic("failed to die") -} diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD index 4a6e83a8b..357517ed4 100644 --- a/pkg/sentry/socket/control/BUILD +++ b/pkg/sentry/socket/control/BUILD @@ -17,6 +17,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", + "//pkg/sentry/socket", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", "//pkg/syserror", diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index 4e95101b7..af1a4e95f 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" @@ -194,15 +195,15 @@ func putCmsg(buf []byte, flags int, msgType uint32, align uint, data []int32) ([ // the available space, we must align down. // // align must be >= 4 and each data int32 is 4 bytes. The length of the - // header is already aligned, so if we align to the with of the data there + // header is already aligned, so if we align to the width of the data there // are two cases: // 1. The aligned length is less than the length of the header. The // unaligned length was also less than the length of the header, so we // can't write anything. // 2. The aligned length is greater than or equal to the length of the - // header. We can write the header plus zero or more datas. We can't write - // a partial int32, so the length of the message will be - // min(aligned length, header + datas). + // header. We can write the header plus zero or more bytes of data. We can't + // write a partial int32, so the length of the message will be + // min(aligned length, header + data). if space < linux.SizeOfControlMessageHeader { flags |= linux.MSG_CTRUNC return buf, flags @@ -239,12 +240,12 @@ func putCmsgStruct(buf []byte, msgLevel, msgType uint32, align uint, data interf buf = binary.Marshal(buf, usermem.ByteOrder, data) - // Check if we went over. + // If the control message data brought us over capacity, omit it. if cap(buf) != cap(ob) { return hdrBuf } - // Fix up length. + // Update control message length to include data. putUint64(ob, uint64(len(buf)-len(ob))) return alignSlice(buf, align) @@ -320,35 +321,109 @@ func PackInq(t *kernel.Task, inq int32, buf []byte) []byte { buf, linux.SOL_TCP, linux.TCP_INQ, - 4, + t.Arch().Width(), inq, ) } +// PackTOS packs an IP_TOS socket control message. +func PackTOS(t *kernel.Task, tos int8, buf []byte) []byte { + return putCmsgStruct( + buf, + linux.SOL_IP, + linux.IP_TOS, + t.Arch().Width(), + tos, + ) +} + +// PackTClass packs an IPV6_TCLASS socket control message. +func PackTClass(t *kernel.Task, tClass int32, buf []byte) []byte { + return putCmsgStruct( + buf, + linux.SOL_IPV6, + linux.IPV6_TCLASS, + t.Arch().Width(), + tClass, + ) +} + +// PackControlMessages packs control messages into the given buffer. +// +// We skip control messages specific to Unix domain sockets. +// +// Note that some control messages may be truncated if they do not fit under +// the capacity of buf. +func PackControlMessages(t *kernel.Task, cmsgs socket.ControlMessages, buf []byte) []byte { + if cmsgs.IP.HasTimestamp { + buf = PackTimestamp(t, cmsgs.IP.Timestamp, buf) + } + + if cmsgs.IP.HasInq { + // In Linux, TCP_CM_INQ is added after SO_TIMESTAMP. + buf = PackInq(t, cmsgs.IP.Inq, buf) + } + + if cmsgs.IP.HasTOS { + buf = PackTOS(t, cmsgs.IP.TOS, buf) + } + + if cmsgs.IP.HasTClass { + buf = PackTClass(t, cmsgs.IP.TClass, buf) + } + + return buf +} + +// cmsgSpace is equivalent to CMSG_SPACE in Linux. +func cmsgSpace(t *kernel.Task, dataLen int) int { + return linux.SizeOfControlMessageHeader + AlignUp(dataLen, t.Arch().Width()) +} + +// CmsgsSpace returns the number of bytes needed to fit the control messages +// represented in cmsgs. +func CmsgsSpace(t *kernel.Task, cmsgs socket.ControlMessages) int { + space := 0 + + if cmsgs.IP.HasTimestamp { + space += cmsgSpace(t, linux.SizeOfTimeval) + } + + if cmsgs.IP.HasInq { + space += cmsgSpace(t, linux.SizeOfControlMessageInq) + } + + if cmsgs.IP.HasTOS { + space += cmsgSpace(t, linux.SizeOfControlMessageTOS) + } + + if cmsgs.IP.HasTClass { + space += cmsgSpace(t, linux.SizeOfControlMessageTClass) + } + + return space +} + // Parse parses a raw socket control message into portable objects. -func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (transport.ControlMessages, error) { +func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.ControlMessages, error) { var ( - fds linux.ControlMessageRights - haveCreds bool - creds linux.ControlMessageCredentials + cmsgs socket.ControlMessages + fds linux.ControlMessageRights ) for i := 0; i < len(buf); { if i+linux.SizeOfControlMessageHeader > len(buf) { - return transport.ControlMessages{}, syserror.EINVAL + return cmsgs, syserror.EINVAL } var h linux.ControlMessageHeader binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageHeader], usermem.ByteOrder, &h) if h.Length < uint64(linux.SizeOfControlMessageHeader) { - return transport.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, syserror.EINVAL } if h.Length > uint64(len(buf)-i) { - return transport.ControlMessages{}, syserror.EINVAL - } - if h.Level != linux.SOL_SOCKET { - return transport.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, syserror.EINVAL } i += linux.SizeOfControlMessageHeader @@ -358,59 +433,79 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (transport. // sizeof(long) in CMSG_ALIGN. width := t.Arch().Width() - switch h.Type { - case linux.SCM_RIGHTS: - rightsSize := AlignDown(length, linux.SizeOfControlMessageRight) - numRights := rightsSize / linux.SizeOfControlMessageRight - - if len(fds)+numRights > linux.SCM_MAX_FD { - return transport.ControlMessages{}, syserror.EINVAL + switch h.Level { + case linux.SOL_SOCKET: + switch h.Type { + case linux.SCM_RIGHTS: + rightsSize := AlignDown(length, linux.SizeOfControlMessageRight) + numRights := rightsSize / linux.SizeOfControlMessageRight + + if len(fds)+numRights > linux.SCM_MAX_FD { + return socket.ControlMessages{}, syserror.EINVAL + } + + for j := i; j < i+rightsSize; j += linux.SizeOfControlMessageRight { + fds = append(fds, int32(usermem.ByteOrder.Uint32(buf[j:j+linux.SizeOfControlMessageRight]))) + } + + i += AlignUp(length, width) + + case linux.SCM_CREDENTIALS: + if length < linux.SizeOfControlMessageCredentials { + return socket.ControlMessages{}, syserror.EINVAL + } + + var creds linux.ControlMessageCredentials + binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageCredentials], usermem.ByteOrder, &creds) + scmCreds, err := NewSCMCredentials(t, creds) + if err != nil { + return socket.ControlMessages{}, err + } + cmsgs.Unix.Credentials = scmCreds + i += AlignUp(length, width) + + default: + // Unknown message type. + return socket.ControlMessages{}, syserror.EINVAL } - - for j := i; j < i+rightsSize; j += linux.SizeOfControlMessageRight { - fds = append(fds, int32(usermem.ByteOrder.Uint32(buf[j:j+linux.SizeOfControlMessageRight]))) + case linux.SOL_IP: + switch h.Type { + case linux.IP_TOS: + cmsgs.IP.HasTOS = true + binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTOS], usermem.ByteOrder, &cmsgs.IP.TOS) + i += AlignUp(length, width) + + default: + return socket.ControlMessages{}, syserror.EINVAL } - - i += AlignUp(length, width) - - case linux.SCM_CREDENTIALS: - if length < linux.SizeOfControlMessageCredentials { - return transport.ControlMessages{}, syserror.EINVAL + case linux.SOL_IPV6: + switch h.Type { + case linux.IPV6_TCLASS: + cmsgs.IP.HasTClass = true + binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTClass], usermem.ByteOrder, &cmsgs.IP.TClass) + i += AlignUp(length, width) + + default: + return socket.ControlMessages{}, syserror.EINVAL } - - binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageCredentials], usermem.ByteOrder, &creds) - haveCreds = true - i += AlignUp(length, width) - default: - // Unknown message type. - return transport.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, syserror.EINVAL } } - var credentials SCMCredentials - if haveCreds { - var err error - if credentials, err = NewSCMCredentials(t, creds); err != nil { - return transport.ControlMessages{}, err - } - } else { - credentials = makeCreds(t, socketOrEndpoint) + if cmsgs.Unix.Credentials == nil { + cmsgs.Unix.Credentials = makeCreds(t, socketOrEndpoint) } - var rights SCMRights if len(fds) > 0 { - var err error - if rights, err = NewSCMRights(t, fds); err != nil { - return transport.ControlMessages{}, err + rights, err := NewSCMRights(t, fds) + if err != nil { + return socket.ControlMessages{}, err } + cmsgs.Unix.Rights = rights } - if credentials == nil && rights == nil { - return transport.ControlMessages{}, nil - } - - return transport.ControlMessages{Credentials: credentials, Rights: rights}, nil + return cmsgs, nil } func makeCreds(t *kernel.Task, socketOrEndpoint interface{}) SCMCredentials { diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index 8b66a719d..4c44c7c0f 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -29,10 +29,12 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/safemem", "//pkg/sentry/socket", + "//pkg/sentry/socket/control", "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip/stack", "//pkg/waiter", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 92beb1bcf..c957b0f1d 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -18,6 +18,7 @@ import ( "fmt" "syscall" + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/fdnotifier" @@ -29,6 +30,7 @@ import ( ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" @@ -41,6 +43,10 @@ const ( // sizeofSockaddr is the size in bytes of the largest sockaddr type // supported by this package. sizeofSockaddr = syscall.SizeofSockaddrInet6 // sizeof(sockaddr_in6) > sizeof(sockaddr_in) + + // maxControlLen is the maximum size of a control message buffer used in a + // recvmsg or sendmsg syscall. + maxControlLen = 1024 ) // socketOperations implements fs.FileOperations and socket.Socket for a socket @@ -281,26 +287,32 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt // Whitelist options and constrain option length. var optlen int switch level { - case syscall.SOL_IPV6: + case linux.SOL_IP: + switch name { + case linux.IP_TOS, linux.IP_RECVTOS: + optlen = sizeofInt32 + } + case linux.SOL_IPV6: switch name { - case syscall.IPV6_V6ONLY: + case linux.IPV6_TCLASS, linux.IPV6_RECVTCLASS, linux.IPV6_V6ONLY: optlen = sizeofInt32 } - case syscall.SOL_SOCKET: + case linux.SOL_SOCKET: switch name { - case syscall.SO_ERROR, syscall.SO_KEEPALIVE, syscall.SO_SNDBUF, syscall.SO_RCVBUF, syscall.SO_REUSEADDR: + case linux.SO_ERROR, linux.SO_KEEPALIVE, linux.SO_SNDBUF, linux.SO_RCVBUF, linux.SO_REUSEADDR: optlen = sizeofInt32 - case syscall.SO_LINGER: + case linux.SO_LINGER: optlen = syscall.SizeofLinger } - case syscall.SOL_TCP: + case linux.SOL_TCP: switch name { - case syscall.TCP_NODELAY: + case linux.TCP_NODELAY: optlen = sizeofInt32 - case syscall.TCP_INFO: + case linux.TCP_INFO: optlen = int(linux.SizeOfTCPInfo) } } + if optlen == 0 { return nil, syserr.ErrProtocolNotAvailable // ENOPROTOOPT } @@ -320,19 +332,24 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [ // Whitelist options and constrain option length. var optlen int switch level { - case syscall.SOL_IPV6: + case linux.SOL_IP: + switch name { + case linux.IP_TOS, linux.IP_RECVTOS: + optlen = sizeofInt32 + } + case linux.SOL_IPV6: switch name { - case syscall.IPV6_V6ONLY: + case linux.IPV6_TCLASS, linux.IPV6_RECVTCLASS, linux.IPV6_V6ONLY: optlen = sizeofInt32 } - case syscall.SOL_SOCKET: + case linux.SOL_SOCKET: switch name { - case syscall.SO_SNDBUF, syscall.SO_RCVBUF, syscall.SO_REUSEADDR: + case linux.SO_SNDBUF, linux.SO_RCVBUF, linux.SO_REUSEADDR: optlen = sizeofInt32 } - case syscall.SOL_TCP: + case linux.SOL_TCP: switch name { - case syscall.TCP_NODELAY: + case linux.TCP_NODELAY: optlen = sizeofInt32 } } @@ -354,11 +371,11 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [ } // RecvMsg implements socket.Socket.RecvMsg. -func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { +func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { // Whitelist flags. // // FIXME(jamieliu): We can't support MSG_ERRQUEUE because it uses ancillary - // messages that netstack/tcpip/transport/unix doesn't understand. Kill the + // messages that gvisor/pkg/tcpip/transport/unix doesn't understand. Kill the // Socket interface's dependence on netstack. if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_PEEK|syscall.MSG_TRUNC) != 0 { return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrInvalidArgument @@ -370,6 +387,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags senderAddrBuf = make([]byte, sizeofSockaddr) } + var controlBuf []byte var msgFlags int recvmsgToBlocks := safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { @@ -384,11 +402,6 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags // We always do a non-blocking recv*(). sysflags := flags | syscall.MSG_DONTWAIT - if dsts.NumBlocks() == 1 { - // Skip allocating []syscall.Iovec. - return recvfrom(s.fd, dsts.Head().ToSlice(), sysflags, &senderAddrBuf) - } - iovs := iovecsFromBlockSeq(dsts) msg := syscall.Msghdr{ Iov: &iovs[0], @@ -398,12 +411,21 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags msg.Name = &senderAddrBuf[0] msg.Namelen = uint32(len(senderAddrBuf)) } + if controlLen > 0 { + if controlLen > maxControlLen { + controlLen = maxControlLen + } + controlBuf = make([]byte, controlLen) + msg.Control = &controlBuf[0] + msg.Controllen = controlLen + } n, err := recvmsg(s.fd, &msg, sysflags) if err != nil { return 0, err } senderAddrBuf = senderAddrBuf[:msg.Namelen] msgFlags = int(msg.Flags) + controlLen = uint64(msg.Controllen) return n, nil }) @@ -429,14 +451,38 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags n, err = dst.CopyOutFrom(t, recvmsgToBlocks) } } - - // We don't allow control messages. - msgFlags &^= linux.MSG_CTRUNC + if err != nil { + return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) + } if senderRequested { senderAddr = socket.UnmarshalSockAddr(s.family, senderAddrBuf) } - return int(n), msgFlags, senderAddr, uint32(len(senderAddrBuf)), socket.ControlMessages{}, syserr.FromError(err) + + unixControlMessages, err := unix.ParseSocketControlMessage(controlBuf[:controlLen]) + if err != nil { + return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) + } + + controlMessages := socket.ControlMessages{} + for _, unixCmsg := range unixControlMessages { + switch unixCmsg.Header.Level { + case syscall.SOL_IP: + switch unixCmsg.Header.Type { + case syscall.IP_TOS: + controlMessages.IP.HasTOS = true + binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageTOS], usermem.ByteOrder, &controlMessages.IP.TOS) + } + case syscall.SOL_IPV6: + switch unixCmsg.Header.Type { + case syscall.IPV6_TCLASS: + controlMessages.IP.HasTClass = true + binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageTClass], usermem.ByteOrder, &controlMessages.IP.TClass) + } + } + } + + return int(n), msgFlags, senderAddr, uint32(len(senderAddrBuf)), controlMessages, nil } // SendMsg implements socket.Socket.SendMsg. @@ -446,6 +492,14 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] return 0, syserr.ErrInvalidArgument } + space := uint64(control.CmsgsSpace(t, controlMessages)) + if space > maxControlLen { + space = maxControlLen + } + controlBuf := make([]byte, 0, space) + // PackControlMessages will append up to space bytes to controlBuf. + controlBuf = control.PackControlMessages(t, controlMessages, controlBuf) + sendmsgFromBlocks := safemem.WriterFunc(func(srcs safemem.BlockSeq) (uint64, error) { // Refuse to do anything if any part of src.Addrs was unusable. if uint64(src.NumBytes()) != srcs.NumBytes() { @@ -458,7 +512,7 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] // We always do a non-blocking send*(). sysflags := flags | syscall.MSG_DONTWAIT - if srcs.NumBlocks() == 1 { + if srcs.NumBlocks() == 1 && len(controlBuf) == 0 { // Skip allocating []syscall.Iovec. src := srcs.Head() n, _, errno := syscall.Syscall6(syscall.SYS_SENDTO, uintptr(s.fd), src.Addr(), uintptr(src.Len()), uintptr(sysflags), uintptr(firstBytePtr(to)), uintptr(len(to))) @@ -477,6 +531,10 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] msg.Name = &to[0] msg.Namelen = uint32(len(to)) } + if len(controlBuf) != 0 { + msg.Control = &controlBuf[0] + msg.Controllen = uint64(len(controlBuf)) + } return sendmsg(s.fd, &msg, sysflags) }) diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index d92399efd..140851c17 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -151,6 +151,8 @@ var Metrics = tcpip.Stats{ PassiveConnectionOpenings: mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."), CurrentEstablished: mustCreateMetric("/netstack/tcp/current_established", "Number of connections in either ESTABLISHED or CLOSE-WAIT state now."), EstablishedResets: mustCreateMetric("/netstack/tcp/established_resets", "Number of times TCP connections have made a direct transition to the CLOSED state from either the ESTABLISHED state or the CLOSE-WAIT state"), + EstablishedClosed: mustCreateMetric("/netstack/tcp/established_closed", "number of times established TCP connections made a transition to CLOSED state."), + EstablishedTimedout: mustCreateMetric("/netstack/tcp/established_timedout", "Number of times an established connection was reset because of keep-alive time out."), ListenOverflowSynDrop: mustCreateMetric("/netstack/tcp/listen_overflow_syn_drop", "Number of times the listen queue overflowed and a SYN was dropped."), ListenOverflowAckDrop: mustCreateMetric("/netstack/tcp/listen_overflow_ack_drop", "Number of times the listen queue overflowed and the final ACK in the handshake was dropped."), ListenOverflowSynCookieSent: mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_sent", "Number of times a SYN cookie was sent."), @@ -324,7 +326,7 @@ func AddressAndFamily(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, } family := usermem.ByteOrder.Uint16(addr) - if family != uint16(sfamily) && (!strict && family != linux.AF_UNSPEC) { + if family != uint16(sfamily) && (strict || family != linux.AF_UNSPEC) { return tcpip.FullAddress{}, family, syserr.ErrAddressFamilyNotSupported } @@ -1125,6 +1127,18 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa return int32(time.Duration(v) / time.Second), nil + case linux.TCP_USER_TIMEOUT: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.TCPUserTimeoutOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(time.Duration(v) / time.Millisecond), nil + case linux.TCP_INFO: var v tcpip.TCPInfoOption if err := ep.GetSockOpt(&v); err != nil { @@ -1561,6 +1575,17 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * } return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v)))) + case linux.TCP_USER_TIMEOUT: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := int32(usermem.ByteOrder.Uint32(optVal)) + if v < 0 { + return syserr.ErrInvalidArgument + } + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v)))) + case linux.TCP_CONGESTION: v := tcpip.CongestionControlOption(optVal) if err := ep.SetSockOpt(v); err != nil { diff --git a/pkg/sentry/socket/rpcinet/syscall_rpc.proto b/pkg/sentry/socket/rpcinet/syscall_rpc.proto index 9586f5923..b677e9eb3 100644 --- a/pkg/sentry/socket/rpcinet/syscall_rpc.proto +++ b/pkg/sentry/socket/rpcinet/syscall_rpc.proto @@ -3,7 +3,6 @@ syntax = "proto3"; // package syscall_rpc is a set of networking related system calls that can be // forwarded to a socket gofer. // -// TODO(b/77963526): Document individual RPCs. package syscall_rpc; message SendmsgRequest { diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 8c250c325..2389a9cdb 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -43,6 +43,11 @@ type ControlMessages struct { IP tcpip.ControlMessages } +// Release releases Unix domain socket credentials and rights. +func (c *ControlMessages) Release() { + c.Unix.Release() +} + // Socket is the interface containing socket syscalls used by the syscall layer // to redirect them to the appropriate implementation. type Socket interface { diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 1aaae8487..885758054 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -118,6 +118,9 @@ func (s *SocketOperations) Endpoint() transport.Endpoint { func extractPath(sockaddr []byte) (string, *syserr.Error) { addr, _, err := netstack.AddressAndFamily(linux.AF_UNIX, sockaddr, true /* strict */) if err != nil { + if err == syserr.ErrAddressFamilyNotSupported { + err = syserr.ErrInvalidArgument + } return "", err } diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index 72ebf766d..d46421199 100644 --- a/pkg/sentry/strace/BUILD +++ b/pkg/sentry/strace/BUILD @@ -14,6 +14,7 @@ go_library( "open.go", "poll.go", "ptrace.go", + "select.go", "signal.go", "socket.go", "strace.go", diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64.go index 5d57b75af..e603f858f 100644 --- a/pkg/sentry/strace/linux64.go +++ b/pkg/sentry/strace/linux64.go @@ -40,7 +40,7 @@ var linuxAMD64 = SyscallMap{ 20: makeSyscallInfo("writev", FD, WriteIOVec, Hex), 21: makeSyscallInfo("access", Path, Oct), 22: makeSyscallInfo("pipe", PipeFDs), - 23: makeSyscallInfo("select", Hex, Hex, Hex, Hex, Timeval), + 23: makeSyscallInfo("select", Hex, SelectFDSet, SelectFDSet, SelectFDSet, Timeval), 24: makeSyscallInfo("sched_yield"), 25: makeSyscallInfo("mremap", Hex, Hex, Hex, Hex, Hex), 26: makeSyscallInfo("msync", Hex, Hex, Hex), @@ -287,7 +287,7 @@ var linuxAMD64 = SyscallMap{ 267: makeSyscallInfo("readlinkat", FD, Path, ReadBuffer, Hex), 268: makeSyscallInfo("fchmodat", FD, Path, Mode), 269: makeSyscallInfo("faccessat", FD, Path, Oct, Hex), - 270: makeSyscallInfo("pselect6", Hex, Hex, Hex, Hex, Hex, Hex), + 270: makeSyscallInfo("pselect6", Hex, SelectFDSet, SelectFDSet, SelectFDSet, Timespec, SigSet), 271: makeSyscallInfo("ppoll", PollFDs, Hex, Timespec, SigSet, Hex), 272: makeSyscallInfo("unshare", CloneFlags), 273: makeSyscallInfo("set_robust_list", Hex, Hex), @@ -335,5 +335,33 @@ var linuxAMD64 = SyscallMap{ 315: makeSyscallInfo("sched_getattr", Hex, Hex, Hex), 316: makeSyscallInfo("renameat2", FD, Path, Hex, Path, Hex), 317: makeSyscallInfo("seccomp", Hex, Hex, Hex), + 318: makeSyscallInfo("getrandom", Hex, Hex, Hex), + 319: makeSyscallInfo("memfd_create", Path, Hex), // Not quite a path, but close. + 320: makeSyscallInfo("kexec_file_load", FD, FD, Hex, Hex, Hex), + 321: makeSyscallInfo("bpf", Hex, Hex, Hex), + 322: makeSyscallInfo("execveat", FD, Path, ExecveStringVector, ExecveStringVector, Hex), + 323: makeSyscallInfo("userfaultfd", Hex), + 324: makeSyscallInfo("membarrier", Hex, Hex), + 325: makeSyscallInfo("mlock2", Hex, Hex, Hex), + 326: makeSyscallInfo("copy_file_range", FD, Hex, FD, Hex, Hex, Hex), + 327: makeSyscallInfo("preadv2", FD, ReadIOVec, Hex, Hex, Hex), + 328: makeSyscallInfo("pwritev2", FD, WriteIOVec, Hex, Hex, Hex), + 329: makeSyscallInfo("pkey_mprotect", Hex, Hex, Hex, Hex), + 330: makeSyscallInfo("pkey_alloc", Hex, Hex), + 331: makeSyscallInfo("pkey_free", Hex), 332: makeSyscallInfo("statx", FD, Path, Hex, Hex, Hex), + 333: makeSyscallInfo("io_pgetevents", Hex, Hex, Hex, Hex, Timespec, SigSet), + 334: makeSyscallInfo("rseq", Hex, Hex, Hex, Hex), + 424: makeSyscallInfo("pidfd_send_signal", FD, Signal, Hex, Hex), + 425: makeSyscallInfo("io_uring_setup", Hex, Hex), + 426: makeSyscallInfo("io_uring_enter", FD, Hex, Hex, Hex, SigSet, Hex), + 427: makeSyscallInfo("io_uring_register", FD, Hex, Hex, Hex), + 428: makeSyscallInfo("open_tree", FD, Path, Hex), + 429: makeSyscallInfo("move_mount", FD, Path, FD, Path, Hex), + 430: makeSyscallInfo("fsopen", Path, Hex), // Not quite a path, but close. + 431: makeSyscallInfo("fsconfig", FD, Hex, Hex, Hex, Hex), + 432: makeSyscallInfo("fsmount", FD, Hex, Hex), + 433: makeSyscallInfo("fspick", FD, Path, Hex), + 434: makeSyscallInfo("pidfd_open", Hex, Hex), + 435: makeSyscallInfo("clone3", Hex, Hex), } diff --git a/pkg/sentry/strace/select.go b/pkg/sentry/strace/select.go new file mode 100644 index 000000000..c77d418e6 --- /dev/null +++ b/pkg/sentry/strace/select.go @@ -0,0 +1,56 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package strace + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/usermem" +) + +func fdsFromSet(t *kernel.Task, set []byte) []int { + var fds []int + // Append n if the n-th bit is 1. + for i, v := range set { + for j := 0; j < 8; j++ { + if (v>>j)&1 == 1 { + fds = append(fds, i*8+j) + } + } + } + return fds +} + +func fdSet(t *kernel.Task, nfds int, addr usermem.Addr) string { + if nfds < 0 { + return fmt.Sprintf("%#x (negative nfds)", addr) + } + if addr == 0 { + return "null" + } + + // Calculate the size of the fd set (one bit per fd). + nBytes := (nfds + 7) / 8 + nBitsInLastPartialByte := nfds % 8 + + set, err := linux.CopyInFDSet(t, addr, nBytes, nBitsInLastPartialByte) + if err != nil { + return fmt.Sprintf("%#x (error decoding fdset: %s)", addr, err) + } + + return fmt.Sprintf("%#x %v", addr, fdsFromSet(t, set)) +} diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index 94334f6d2..51f2efb39 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -208,6 +208,15 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64) i += linux.SizeOfControlMessageHeader width := t.Arch().Width() length := int(h.Length) - linux.SizeOfControlMessageHeader + if length < 0 { + strs = append(strs, fmt.Sprintf( + "{level=%s, type=%s, length=%d, content too short}", + level, + typ, + h.Length, + )) + break + } if skipData { strs = append(strs, fmt.Sprintf("{level=%s, type=%s, length=%d}", level, typ, h.Length)) diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 311389547..629c1f308 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -439,6 +439,8 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo output = append(output, capData(t, args[arg-1].Pointer(), args[arg].Pointer())) case PollFDs: output = append(output, pollFDs(t, args[arg].Pointer(), uint(args[arg+1].Uint()), false)) + case SelectFDSet: + output = append(output, fdSet(t, int(args[0].Int()), args[arg].Pointer())) case Oct: output = append(output, "0o"+strconv.FormatUint(args[arg].Uint64(), 8)) case Hex: diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go index 3c389d375..e5d486c4e 100644 --- a/pkg/sentry/strace/syscalls.go +++ b/pkg/sentry/strace/syscalls.go @@ -206,6 +206,10 @@ const ( // PollFDs is an array of struct pollfd. The number of entries in the // array is in the next argument. PollFDs + + // SelectFDSet is an fd_set argument in select(2)/pselect(2). The number of + // fds represented must be the first argument. + SelectFDSet ) // defaultFormat is the syscall argument format to use if the actual format is diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 4c0bf96e4..a76975cee 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -30,6 +30,7 @@ go_library( "sys_random.go", "sys_read.go", "sys_rlimit.go", + "sys_rseq.go", "sys_rusage.go", "sys_sched.go", "sys_seccomp.go", @@ -49,6 +50,7 @@ go_library( "sys_tls.go", "sys_utsname.go", "sys_write.go", + "sys_xattr.go", "timespec.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/syscalls/linux", diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go index 81e4f93a6..479c5f6ff 100644 --- a/pkg/sentry/syscalls/linux/linux64_amd64.go +++ b/pkg/sentry/syscalls/linux/linux64_amd64.go @@ -228,10 +228,10 @@ var AMD64 = &kernel.SyscallTable{ 185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil), 186: syscalls.Supported("gettid", Gettid), 187: syscalls.Supported("readahead", Readahead), - 188: syscalls.Error("setxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 188: syscalls.PartiallySupported("setxattr", Setxattr, "Only supported for tmpfs.", nil), 189: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), 190: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 191: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 191: syscalls.PartiallySupported("getxattr", Getxattr, "Only supported for tmpfs.", nil), 192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), 193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), 194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), @@ -260,7 +260,7 @@ var AMD64 = &kernel.SyscallTable{ 217: syscalls.Supported("getdents64", Getdents64), 218: syscalls.Supported("set_tid_address", SetTidAddress), 219: syscalls.Supported("restart_syscall", RestartSyscall), - 220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920) + 220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), 221: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil), 222: syscalls.Supported("timer_create", TimerCreate), 223: syscalls.Supported("timer_settime", TimerSettime), @@ -367,11 +367,31 @@ var AMD64 = &kernel.SyscallTable{ 324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267) 325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - // Syscalls after 325 are "backports" from versions of Linux after 4.4. + // Syscalls implemented after 325 are "backports" from versions + // of Linux after 4.4. 326: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil), 327: syscalls.Supported("preadv2", Preadv2), 328: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), + 329: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil), + 330: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil), + 331: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil), 332: syscalls.Supported("statx", Statx), + 333: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil), + 334: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil), + + // Linux skips ahead to syscall 424 to sync numbers between arches. + 424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil), + 425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil), + 426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil), + 427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil), + 428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil), + 429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil), + 430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil), + 431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil), + 432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil), + 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil), + 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil), + 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil), }, Emulate: map[usermem.Addr]uintptr{ diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go index a809115e0..d3f61f5e8 100644 --- a/pkg/sentry/syscalls/linux/linux64_arm64.go +++ b/pkg/sentry/syscalls/linux/linux64_arm64.go @@ -41,10 +41,10 @@ var ARM64 = &kernel.SyscallTable{ 2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 5: syscalls.Error("setxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 5: syscalls.PartiallySupported("setxattr", Setxattr, "Only supported for tmpfs.", nil), 6: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), 7: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 8: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 8: syscalls.PartiallySupported("getxattr", Getxattr, "Only supported for tmpfs.", nil), 9: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), 10: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), 11: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), @@ -224,7 +224,7 @@ var ARM64 = &kernel.SyscallTable{ 189: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) 190: syscalls.Supported("semget", Semget), 191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil), - 192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920) + 192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), 193: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), 194: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil), 195: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil), @@ -295,14 +295,33 @@ var ARM64 = &kernel.SyscallTable{ 278: syscalls.Supported("getrandom", GetRandom), 279: syscalls.Supported("memfd_create", MemfdCreate), 280: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), - 281: syscalls.ErrorWithEvent("execveat", syserror.ENOSYS, "", []string{"gvisor.dev/issue/265"}), // TODO(b/118901836) + 281: syscalls.Supported("execveat", Execveat), 282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) 283: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267) 284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 285: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil), 286: syscalls.Supported("preadv2", Preadv2), 287: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), + 288: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil), + 289: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil), + 290: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil), 291: syscalls.Supported("statx", Statx), + 292: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil), + 293: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil), + + // Linux skips ahead to syscall 424 to sync numbers between arches. + 424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil), + 425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil), + 426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil), + 427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil), + 428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil), + 429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil), + 430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil), + 431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil), + 432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil), + 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil), + 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil), + 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil), }, Emulate: map[usermem.Addr]uintptr{}, diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 167c2b60b..9bc2445a5 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -171,6 +171,9 @@ func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uint } } + // Truncate is called when O_TRUNC is specified for any kind of + // existing Dirent. Behavior is delegated to the entry's Truncate + // implementation. if flags&linux.O_TRUNC != 0 { if err := d.Inode.Truncate(t, d, 0); err != nil { return err @@ -397,7 +400,9 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l return err } - // Should we truncate the file? + // Truncate is called when O_TRUNC is specified for any kind of + // existing Dirent. Behavior is delegated to the entry's Truncate + // implementation. if flags&linux.O_TRUNC != 0 { if err := found.Inode.Truncate(t, found, 0); err != nil { return err @@ -835,25 +840,42 @@ func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC return uintptr(newfd), nil, nil } -func fGetOwn(t *kernel.Task, file *fs.File) int32 { +func fGetOwnEx(t *kernel.Task, file *fs.File) linux.FOwnerEx { ma := file.Async(nil) if ma == nil { - return 0 + return linux.FOwnerEx{} } a := ma.(*fasync.FileAsync) ot, otg, opg := a.Owner() switch { case ot != nil: - return int32(t.PIDNamespace().IDOfTask(ot)) + return linux.FOwnerEx{ + Type: linux.F_OWNER_TID, + PID: int32(t.PIDNamespace().IDOfTask(ot)), + } case otg != nil: - return int32(t.PIDNamespace().IDOfThreadGroup(otg)) + return linux.FOwnerEx{ + Type: linux.F_OWNER_PID, + PID: int32(t.PIDNamespace().IDOfThreadGroup(otg)), + } case opg != nil: - return int32(-t.PIDNamespace().IDOfProcessGroup(opg)) + return linux.FOwnerEx{ + Type: linux.F_OWNER_PGRP, + PID: int32(t.PIDNamespace().IDOfProcessGroup(opg)), + } default: - return 0 + return linux.FOwnerEx{} } } +func fGetOwn(t *kernel.Task, file *fs.File) int32 { + owner := fGetOwnEx(t, file) + if owner.Type == linux.F_OWNER_PGRP { + return -owner.PID + } + return owner.PID +} + // fSetOwn sets the file's owner with the semantics of F_SETOWN in Linux. // // If who is positive, it represents a PID. If negative, it represents a PGID. @@ -896,11 +918,13 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall t.FDTable().SetFlags(fd, kernel.FDFlags{ CloseOnExec: flags&linux.FD_CLOEXEC != 0, }) + return 0, nil, nil case linux.F_GETFL: return uintptr(file.Flags().ToLinux()), nil, nil case linux.F_SETFL: flags := uint(args[2].Uint()) file.SetFlags(linuxToFlags(flags).Settable()) + return 0, nil, nil case linux.F_SETLK, linux.F_SETLKW: // In Linux the file system can choose to provide lock operations for an inode. // Normally pipe and socket types lack lock operations. We diverge and use a heavy @@ -1003,6 +1027,44 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.F_SETOWN: fSetOwn(t, file, args[2].Int()) return 0, nil, nil + case linux.F_GETOWN_EX: + addr := args[2].Pointer() + owner := fGetOwnEx(t, file) + _, err := t.CopyOut(addr, &owner) + return 0, nil, err + case linux.F_SETOWN_EX: + addr := args[2].Pointer() + var owner linux.FOwnerEx + n, err := t.CopyIn(addr, &owner) + if err != nil { + return 0, nil, err + } + a := file.Async(fasync.New).(*fasync.FileAsync) + switch owner.Type { + case linux.F_OWNER_TID: + task := t.PIDNamespace().TaskWithID(kernel.ThreadID(owner.PID)) + if task == nil { + return 0, nil, syserror.ESRCH + } + a.SetOwnerTask(t, task) + return uintptr(n), nil, nil + case linux.F_OWNER_PID: + tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(owner.PID)) + if tg == nil { + return 0, nil, syserror.ESRCH + } + a.SetOwnerThreadGroup(t, tg) + return uintptr(n), nil, nil + case linux.F_OWNER_PGRP: + pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(owner.PID)) + if pg == nil { + return 0, nil, syserror.ESRCH + } + a.SetOwnerProcessGroup(t, pg) + return uintptr(n), nil, nil + default: + return 0, nil, syserror.EINVAL + } case linux.F_GET_SEALS: val, err := tmpfs.GetSeals(file.Dirent.Inode) return uintptr(val), nil, err @@ -1030,7 +1092,6 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Everything else is not yet supported. return 0, nil, syserror.EINVAL } - return 0, nil, nil } const ( @@ -1484,6 +1545,8 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if fs.IsDir(d.Inode.StableAttr) { return syserror.EISDIR } + // In contrast to open(O_TRUNC), truncate(2) is only valid for file + // types. if !fs.IsFile(d.Inode.StableAttr) { return syserror.EINVAL } @@ -1522,7 +1585,8 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, syserror.EINVAL } - // Note that this is different from truncate(2) above, where a + // In contrast to open(O_TRUNC), truncate(2) is only valid for file + // types. Note that this is different from truncate(2) above, where a // directory returns EISDIR. if !fs.IsFile(file.Dirent.Inode.StableAttr) { return 0, nil, syserror.EINVAL diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index b9bd25464..bde17a767 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -226,6 +226,11 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if mask == 0 { return 0, nil, syserror.EINVAL } + if val <= 0 { + // The Linux kernel wakes one waiter even if val is + // non-positive. + val = 1 + } n, err := t.Futex().Wake(t, addr, private, mask, val) return uintptr(n), nil, err @@ -242,6 +247,11 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.FUTEX_WAKE_OP: op := uint32(val3) + if val <= 0 { + // The Linux kernel wakes one waiter even if val is + // non-positive. + val = 1 + } n, err := t.Futex().WakeOp(t, addr, naddr, private, val, nreq, op) return uintptr(n), nil, err diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index 7a13beac2..2b2df989a 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -197,53 +197,51 @@ func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration) return remainingTimeout, n, err } +// CopyInFDSet copies an fd set from select(2)/pselect(2). +func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialByte int) ([]byte, error) { + set := make([]byte, nBytes) + + if addr != 0 { + if _, err := t.CopyIn(addr, &set); err != nil { + return nil, err + } + // If we only use part of the last byte, mask out the extraneous bits. + // + // N.B. This only works on little-endian architectures. + if nBitsInLastPartialByte != 0 { + set[nBytes-1] &^= byte(0xff) << nBitsInLastPartialByte + } + } + return set, nil +} + func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) { if nfds < 0 || nfds > fileCap { return 0, syserror.EINVAL } - // Capture all the provided input vectors. - // - // N.B. This only works on little-endian architectures. - byteCount := (nfds + 7) / 8 - - bitsInLastPartialByte := uint(nfds % 8) - r := make([]byte, byteCount) - w := make([]byte, byteCount) - e := make([]byte, byteCount) + // Calculate the size of the fd sets (one bit per fd). + nBytes := (nfds + 7) / 8 + nBitsInLastPartialByte := nfds % 8 - if readFDs != 0 { - if _, err := t.CopyIn(readFDs, &r); err != nil { - return 0, err - } - // Mask out bits above nfds. - if bitsInLastPartialByte != 0 { - r[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte - } + // Capture all the provided input vectors. + r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte) + if err != nil { + return 0, err } - - if writeFDs != 0 { - if _, err := t.CopyIn(writeFDs, &w); err != nil { - return 0, err - } - if bitsInLastPartialByte != 0 { - w[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte - } + w, err := CopyInFDSet(t, writeFDs, nBytes, nBitsInLastPartialByte) + if err != nil { + return 0, err } - - if exceptFDs != 0 { - if _, err := t.CopyIn(exceptFDs, &e); err != nil { - return 0, err - } - if bitsInLastPartialByte != 0 { - e[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte - } + e, err := CopyInFDSet(t, exceptFDs, nBytes, nBitsInLastPartialByte) + if err != nil { + return 0, err } // Count how many FDs are actually being requested so that we can build // a PollFD array. fdCount := 0 - for i := 0; i < byteCount; i++ { + for i := 0; i < nBytes; i++ { v := r[i] | w[i] | e[i] for v != 0 { v &= (v - 1) @@ -254,7 +252,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add // Build the PollFD array. pfd := make([]linux.PollFD, 0, fdCount) var fd int32 - for i := 0; i < byteCount; i++ { + for i := 0; i < nBytes; i++ { rV, wV, eV := r[i], w[i], e[i] v := rV | wV | eV m := byte(1) @@ -295,8 +293,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add } // Do the syscall, then count the number of bits set. - _, _, err := pollBlock(t, pfd, timeout) - if err != nil { + if _, _, err = pollBlock(t, pfd, timeout); err != nil { return 0, syserror.ConvertIntr(err, syserror.EINTR) } diff --git a/pkg/sentry/syscalls/linux/sys_rseq.go b/pkg/sentry/syscalls/linux/sys_rseq.go new file mode 100644 index 000000000..90db10ea6 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_rseq.go @@ -0,0 +1,48 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// RSeq implements syscall rseq(2). +func RSeq(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].Uint() + flags := args[2].Int() + signature := args[3].Uint() + + if !t.RSeqAvailable() { + // Event for applications that want rseq on a configuration + // that doesn't support them. + t.Kernel().EmitUnimplementedEvent(t) + return 0, nil, syserror.ENOSYS + } + + switch flags { + case 0: + // Register. + return 0, nil, t.SetRSeq(addr, length, signature) + case linux.RSEQ_FLAG_UNREGISTER: + return 0, nil, t.ClearRSeq(addr, length, signature) + default: + // Unknown flag. + return 0, nil, syserror.EINVAL + } +} diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go index d57ffb3a1..4a8bc24a2 100644 --- a/pkg/sentry/syscalls/linux/sys_shm.go +++ b/pkg/sentry/syscalls/linux/sys_shm.go @@ -39,10 +39,13 @@ func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } + defer segment.DecRef() return uintptr(segment.ID), nil, nil } // findSegment retrives a shm segment by the given id. +// +// findSegment returns a reference on Shm. func findSegment(t *kernel.Task, id shm.ID) (*shm.Shm, error) { r := t.IPCNamespace().ShmRegistry() segment := r.FindByID(id) @@ -63,6 +66,7 @@ func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if err != nil { return 0, nil, syserror.EINVAL } + defer segment.DecRef() opts, err := segment.ConfigureAttach(t, addr, shm.AttachOpts{ Execute: flag&linux.SHM_EXEC == linux.SHM_EXEC, @@ -72,7 +76,6 @@ func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if err != nil { return 0, nil, err } - defer segment.DecRef() addr, err = t.MemoryManager().MMap(t, opts) return uintptr(addr), nil, err } @@ -105,6 +108,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, syserror.EINVAL } + defer segment.DecRef() stat, err := segment.IPCStat(t) if err == nil { @@ -128,6 +132,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, syserror.EINVAL } + defer segment.DecRef() switch cmd { case linux.IPC_SET: diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index ab1001f16..4b5aafcc0 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -764,7 +764,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i } if !cms.Unix.Empty() { mflags |= linux.MSG_CTRUNC - cms.Unix.Release() + cms.Release() } if int(msg.Flags) != mflags { @@ -784,24 +784,16 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i if e != nil { return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) } - defer cms.Unix.Release() + defer cms.Release() controlData := make([]byte, 0, msg.ControlLen) + controlData = control.PackControlMessages(t, cms, controlData) if cr, ok := s.(transport.Credentialer); ok && cr.Passcred() { creds, _ := cms.Unix.Credentials.(control.SCMCredentials) controlData, mflags = control.PackCredentials(t, creds, controlData, mflags) } - if cms.IP.HasTimestamp { - controlData = control.PackTimestamp(t, cms.IP.Timestamp, controlData) - } - - if cms.IP.HasInq { - // In Linux, TCP_CM_INQ is added after SO_TIMESTAMP. - controlData = control.PackInq(t, cms.IP.Inq, controlData) - } - if cms.Unix.Rights != nil { controlData, mflags = control.PackRights(t, cms.Unix.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags) } @@ -877,7 +869,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag } n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) - cm.Unix.Release() + cm.Release() if e != nil { return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) } @@ -1060,7 +1052,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme } // Call the syscall implementation. - n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: controlMessages}) + n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) err = handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file) if err != nil { controlMessages.Release() diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go new file mode 100644 index 000000000..97d9a65ea --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_xattr.go @@ -0,0 +1,169 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "strings" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Getxattr implements linux syscall getxattr(2). +func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := args[3].SizeT() + + path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + valueLen := 0 + err = fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + value, err := getxattr(t, d, dirPath, nameAddr) + if err != nil { + return err + } + + valueLen = len(value) + if size == 0 { + return nil + } + if size > linux.XATTR_SIZE_MAX { + size = linux.XATTR_SIZE_MAX + } + if valueLen > int(size) { + return syserror.ERANGE + } + + _, err = t.CopyOutBytes(valueAddr, []byte(value)) + return err + }) + if err != nil { + return 0, nil, err + } + return uintptr(valueLen), nil, nil +} + +// getxattr implements getxattr from the given *fs.Dirent. +func getxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr usermem.Addr) (string, error) { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return "", syserror.ENOTDIR + } + + if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Read: true}); err != nil { + return "", err + } + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return "", err + } + + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return "", syserror.EOPNOTSUPP + } + + return d.Inode.Getxattr(name) +} + +// Setxattr implements linux syscall setxattr(2). +func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := args[3].SizeT() + flags := args[4].Uint() + + path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 { + return 0, nil, syserror.EINVAL + } + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + return setxattr(t, d, dirPath, nameAddr, valueAddr, size, flags) + }) +} + +// setxattr implements setxattr from the given *fs.Dirent. +func setxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr usermem.Addr, size uint, flags uint32) error { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil { + return err + } + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return err + } + + if size > linux.XATTR_SIZE_MAX { + return syserror.E2BIG + } + buf := make([]byte, size) + if _, err = t.CopyInBytes(valueAddr, buf); err != nil { + return err + } + value := string(buf) + + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } + + return d.Inode.Setxattr(name, value) +} + +func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) { + name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1) + if err != nil { + if err == syserror.ENAMETOOLONG { + return "", syserror.ERANGE + } + return "", err + } + if len(name) == 0 { + return "", syserror.ERANGE + } + return name, nil +} + +func checkXattrPermissions(t *kernel.Task, i *fs.Inode, perms fs.PermMask) error { + // Restrict xattrs to regular files and directories. + // + // In Linux, this restriction technically only applies to xattrs in the + // "user.*" namespace, but we don't allow any other xattr prefixes anyway. + if !fs.IsRegular(i.StableAttr) && !fs.IsDir(i.StableAttr) { + if perms.Write { + return syserror.EPERM + } + return syserror.ENODATA + } + + return i.CheckPermission(t, perms) +} diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 74a325309..4c6aa04a1 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -9,6 +9,7 @@ go_library( "context.go", "debug.go", "dentry.go", + "device.go", "file_description.go", "file_description_impl_util.go", "filesystem.go", @@ -17,9 +18,9 @@ go_library( "mount.go", "mount_unsafe.go", "options.go", + "pathname.go", "permissions.go", "resolving_path.go", - "syscalls.go", "testutil.go", "vfs.go", ], diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go index 32cf9151b..705194ebc 100644 --- a/pkg/sentry/vfs/context.go +++ b/pkg/sentry/vfs/context.go @@ -24,6 +24,9 @@ type contextID int const ( // CtxMountNamespace is a Context.Value key for a MountNamespace. CtxMountNamespace contextID = iota + + // CtxRoot is a Context.Value key for a VFS root. + CtxRoot ) // MountNamespaceFromContext returns the MountNamespace used by ctx. It does @@ -35,3 +38,13 @@ func MountNamespaceFromContext(ctx context.Context) *MountNamespace { } return nil } + +// RootFromContext returns the VFS root used by ctx. It takes a reference on +// the returned VirtualDentry. If ctx does not have a specific VFS root, +// RootFromContext returns a zero-value VirtualDentry. +func RootFromContext(ctx context.Context) VirtualDentry { + if v := ctx.Value(CtxRoot); v != nil { + return v.(VirtualDentry) + } + return VirtualDentry{} +} diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index 40f4c1d09..1bc9c4a38 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -85,12 +85,12 @@ type Dentry struct { // mounts is accessed using atomic memory operations. mounts uint32 - // mu synchronizes disowning and mounting over this Dentry. - mu sync.Mutex - // children are child Dentries. children map[string]*Dentry + // mu synchronizes disowning and mounting over this Dentry. + mu sync.Mutex + // impl is the DentryImpl associated with this Dentry. impl is immutable. // This should be the last field in Dentry. impl DentryImpl @@ -199,6 +199,18 @@ func (d *Dentry) HasChildren() bool { return len(d.children) != 0 } +// Children returns a map containing all of d's children. +func (d *Dentry) Children() map[string]*Dentry { + if !d.HasChildren() { + return nil + } + m := make(map[string]*Dentry) + for name, child := range d.children { + m[name] = child + } + return m +} + // InsertChild makes child a child of d with the given name. // // InsertChild is a mutator of d and child. @@ -222,6 +234,18 @@ func (d *Dentry) InsertChild(child *Dentry, name string) { child.name = name } +// IsAncestorOf returns true if d is an ancestor of d2; that is, d is either +// d2's parent or an ancestor of d2's parent. +func (d *Dentry) IsAncestorOf(d2 *Dentry) bool { + for d2.parent != nil { + if d2.parent == d { + return true + } + d2 = d2.parent + } + return false +} + // PrepareDeleteDentry must be called before attempting to delete the file // represented by d. If PrepareDeleteDentry succeeds, the caller must call // AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome. @@ -271,21 +295,6 @@ func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) { } } -// DeleteDentry combines PrepareDeleteDentry and CommitDeleteDentry, as -// appropriate for in-memory filesystems that don't need to ensure that some -// external state change succeeds before committing the deletion. -// -// DeleteDentry is a mutator of d and d.Parent(). -// -// Preconditions: d is a child Dentry. -func (vfs *VirtualFilesystem) DeleteDentry(mntns *MountNamespace, d *Dentry) error { - if err := vfs.PrepareDeleteDentry(mntns, d); err != nil { - return err - } - vfs.CommitDeleteDentry(d) - return nil -} - // ForceDeleteDentry causes d to become disowned. It should only be used in // cases where VFS has no ability to stop the deletion (e.g. d represents the // local state of a file on a remote filesystem on which the file has already @@ -314,7 +323,7 @@ func (vfs *VirtualFilesystem) ForceDeleteDentry(d *Dentry) { // CommitRenameExchangeDentry depending on the rename's outcome. // // Preconditions: from is a child Dentry. If to is not nil, it must be a child -// Dentry from the same Filesystem. +// Dentry from the same Filesystem. from != to. func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error { if checkInvariants { if from.parent == nil { diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go new file mode 100644 index 000000000..cb672e36f --- /dev/null +++ b/pkg/sentry/vfs/device.go @@ -0,0 +1,100 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/syserror" +) + +// DeviceKind indicates whether a device is a block or character device. +type DeviceKind uint32 + +const ( + // BlockDevice indicates a block device. + BlockDevice DeviceKind = iota + + // CharDevice indicates a character device. + CharDevice +) + +// String implements fmt.Stringer.String. +func (kind DeviceKind) String() string { + switch kind { + case BlockDevice: + return "block" + case CharDevice: + return "character" + default: + return fmt.Sprintf("invalid device kind %d", kind) + } +} + +type devTuple struct { + kind DeviceKind + major uint32 + minor uint32 +} + +// A Device backs device special files. +type Device interface { + // Open returns a FileDescription representing this device. + Open(ctx context.Context, mnt *Mount, d *Dentry, opts OpenOptions) (*FileDescription, error) +} + +type registeredDevice struct { + dev Device + opts RegisterDeviceOptions +} + +// RegisterDeviceOptions contains options to +// VirtualFilesystem.RegisterDevice(). +type RegisterDeviceOptions struct { + // GroupName is the name shown for this device registration in + // /proc/devices. If GroupName is empty, this registration will not be + // shown in /proc/devices. + GroupName string +} + +// RegisterDevice registers the given Device in vfs with the given major and +// minor device numbers. +func (vfs *VirtualFilesystem) RegisterDevice(kind DeviceKind, major, minor uint32, dev Device, opts *RegisterDeviceOptions) error { + tup := devTuple{kind, major, minor} + vfs.devicesMu.Lock() + defer vfs.devicesMu.Unlock() + if existing, ok := vfs.devices[tup]; ok { + return fmt.Errorf("%s device number (%d, %d) is already registered to device type %T", kind, major, minor, existing.dev) + } + vfs.devices[tup] = ®isteredDevice{ + dev: dev, + opts: *opts, + } + return nil +} + +// OpenDeviceSpecialFile returns a FileDescription representing the given +// device. +func (vfs *VirtualFilesystem) OpenDeviceSpecialFile(ctx context.Context, mnt *Mount, d *Dentry, kind DeviceKind, major, minor uint32, opts *OpenOptions) (*FileDescription, error) { + tup := devTuple{kind, major, minor} + vfs.devicesMu.RLock() + defer vfs.devicesMu.RUnlock() + rd, ok := vfs.devices[tup] + if !ok { + return nil, syserror.ENXIO + } + return rd.dev.Open(ctx, mnt, d, *opts) +} diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 34007eb57..6afe280bc 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -20,8 +20,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -38,49 +40,58 @@ type FileDescription struct { // operations. refs int64 + // statusFlags contains status flags, "initialized by open(2) and possibly + // modified by fcntl()" - fcntl(2). statusFlags is accessed using atomic + // memory operations. + statusFlags uint32 + // vd is the filesystem location at which this FileDescription was opened. // A reference is held on vd. vd is immutable. vd VirtualDentry + opts FileDescriptionOptions + // impl is the FileDescriptionImpl associated with this Filesystem. impl is // immutable. This should be the last field in FileDescription. impl FileDescriptionImpl } -// Init must be called before first use of fd. It takes ownership of references -// on mnt and d held by the caller. -func (fd *FileDescription) Init(impl FileDescriptionImpl, mnt *Mount, d *Dentry) { +// FileDescriptionOptions contains options to FileDescription.Init(). +type FileDescriptionOptions struct { + // If AllowDirectIO is true, allow O_DIRECT to be set on the file. This is + // usually only the case if O_DIRECT would actually have an effect. + AllowDirectIO bool + + // If UseDentryMetadata is true, calls to FileDescription methods that + // interact with file and filesystem metadata (Stat, SetStat, StatFS, + // Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling + // the corresponding FilesystemImpl methods instead of the corresponding + // FileDescriptionImpl methods. + // + // UseDentryMetadata is intended for file descriptions that are implemented + // outside of individual filesystems, such as pipes, sockets, and device + // special files. FileDescriptions for which UseDentryMetadata is true may + // embed DentryMetadataFileDescriptionImpl to obtain appropriate + // implementations of FileDescriptionImpl methods that should not be + // called. + UseDentryMetadata bool +} + +// Init must be called before first use of fd. It takes references on mnt and +// d. statusFlags is the initial file description status flags, which is +// usually the full set of flags passed to open(2). +func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) { fd.refs = 1 + fd.statusFlags = statusFlags | linux.O_LARGEFILE fd.vd = VirtualDentry{ mount: mnt, dentry: d, } + fd.vd.IncRef() + fd.opts = *opts fd.impl = impl } -// Impl returns the FileDescriptionImpl associated with fd. -func (fd *FileDescription) Impl() FileDescriptionImpl { - return fd.impl -} - -// Mount returns the mount on which fd was opened. It does not take a reference -// on the returned Mount. -func (fd *FileDescription) Mount() *Mount { - return fd.vd.mount -} - -// Dentry returns the dentry at which fd was opened. It does not take a -// reference on the returned Dentry. -func (fd *FileDescription) Dentry() *Dentry { - return fd.vd.dentry -} - -// VirtualDentry returns the location at which fd was opened. It does not take -// a reference on the returned VirtualDentry. -func (fd *FileDescription) VirtualDentry() VirtualDentry { - return fd.vd -} - // IncRef increments fd's reference count. func (fd *FileDescription) IncRef() { atomic.AddInt64(&fd.refs, 1) @@ -112,6 +123,82 @@ func (fd *FileDescription) DecRef() { } } +// Mount returns the mount on which fd was opened. It does not take a reference +// on the returned Mount. +func (fd *FileDescription) Mount() *Mount { + return fd.vd.mount +} + +// Dentry returns the dentry at which fd was opened. It does not take a +// reference on the returned Dentry. +func (fd *FileDescription) Dentry() *Dentry { + return fd.vd.dentry +} + +// VirtualDentry returns the location at which fd was opened. It does not take +// a reference on the returned VirtualDentry. +func (fd *FileDescription) VirtualDentry() VirtualDentry { + return fd.vd +} + +// StatusFlags returns file description status flags, as for fcntl(F_GETFL). +func (fd *FileDescription) StatusFlags() uint32 { + return atomic.LoadUint32(&fd.statusFlags) +} + +// SetStatusFlags sets file description status flags, as for fcntl(F_SETFL). +func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Credentials, flags uint32) error { + // Compare Linux's fs/fcntl.c:setfl(). + oldFlags := fd.StatusFlags() + // Linux documents this check as "O_APPEND cannot be cleared if the file is + // marked as append-only and the file is open for write", which would make + // sense. However, the check as actually implemented seems to be "O_APPEND + // cannot be changed if the file is marked as append-only". + if (flags^oldFlags)&linux.O_APPEND != 0 { + stat, err := fd.Stat(ctx, StatOptions{ + // There is no mask bit for stx_attributes. + Mask: 0, + // Linux just reads inode::i_flags directly. + Sync: linux.AT_STATX_DONT_SYNC, + }) + if err != nil { + return err + } + if (stat.AttributesMask&linux.STATX_ATTR_APPEND != 0) && (stat.Attributes&linux.STATX_ATTR_APPEND != 0) { + return syserror.EPERM + } + } + if (flags&linux.O_NOATIME != 0) && (oldFlags&linux.O_NOATIME == 0) { + stat, err := fd.Stat(ctx, StatOptions{ + Mask: linux.STATX_UID, + // Linux's inode_owner_or_capable() just reads inode::i_uid + // directly. + Sync: linux.AT_STATX_DONT_SYNC, + }) + if err != nil { + return err + } + if stat.Mask&linux.STATX_UID == 0 { + return syserror.EPERM + } + if !CanActAsOwner(creds, auth.KUID(stat.UID)) { + return syserror.EPERM + } + } + if flags&linux.O_DIRECT != 0 && !fd.opts.AllowDirectIO { + return syserror.EINVAL + } + // TODO(jamieliu): FileDescriptionImpl.SetOAsync()? + const settableFlags = linux.O_APPEND | linux.O_ASYNC | linux.O_DIRECT | linux.O_NOATIME | linux.O_NONBLOCK + atomic.StoreUint32(&fd.statusFlags, (oldFlags&^settableFlags)|(flags&settableFlags)) + return nil +} + +// Impl returns the FileDescriptionImpl associated with fd. +func (fd *FileDescription) Impl() FileDescriptionImpl { + return fd.impl +} + // FileDescriptionImpl contains implementation details for an FileDescription. // Implementations of FileDescriptionImpl should contain their associated // FileDescription by value as their first field. @@ -120,6 +207,8 @@ func (fd *FileDescription) DecRef() { // be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID and // auth.KGID respectively). // +// All methods may return errors not specified. +// // FileDescriptionImpl is analogous to Linux's struct file_operations. type FileDescriptionImpl interface { // Release is called when the associated FileDescription reaches zero @@ -131,14 +220,6 @@ type FileDescriptionImpl interface { // prevent the file descriptor from being closed. OnClose(ctx context.Context) error - // StatusFlags returns file description status flags, as for - // fcntl(F_GETFL). - StatusFlags(ctx context.Context) (uint32, error) - - // SetStatusFlags sets file description status flags, as for - // fcntl(F_SETFL). - SetStatusFlags(ctx context.Context, flags uint32) error - // Stat returns metadata for the file represented by the FileDescription. Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) @@ -156,6 +237,10 @@ type FileDescriptionImpl interface { // PRead reads from the file into dst, starting at the given offset, and // returns the number of bytes read. PRead is permitted to return partial // reads with a nil error. + // + // Errors: + // + // - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP. PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) // Read is similar to PRead, but does not specify an offset. @@ -165,6 +250,10 @@ type FileDescriptionImpl interface { // the number of bytes read; note that POSIX 2.9.7 "Thread Interactions // with Regular File Operations" requires that all operations that may // mutate the FileDescription offset are serialized. + // + // Errors: + // + // - If opts.Flags specifies unsupported options, Read returns EOPNOTSUPP. Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) // PWrite writes src to the file, starting at the given offset, and returns @@ -174,6 +263,11 @@ type FileDescriptionImpl interface { // As in Linux (but not POSIX), if O_APPEND is in effect for the // FileDescription, PWrite should ignore the offset and append data to the // end of the file. + // + // Errors: + // + // - If opts.Flags specifies unsupported options, PWrite returns + // EOPNOTSUPP. PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) // Write is similar to PWrite, but does not specify an offset, which is @@ -183,6 +277,10 @@ type FileDescriptionImpl interface { // PWrite that uses a FileDescription offset, to make it possible for // remote filesystems to implement O_APPEND correctly (i.e. atomically with // respect to writers outside the scope of VFS). + // + // Errors: + // + // - If opts.Flags specifies unsupported options, Write returns EOPNOTSUPP. Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) // IterDirents invokes cb on each entry in the directory represented by the @@ -212,7 +310,21 @@ type FileDescriptionImpl interface { // Ioctl implements the ioctl(2) syscall. Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) - // TODO: extended attributes; file locking + // Listxattr returns all extended attribute names for the file. + Listxattr(ctx context.Context) ([]string, error) + + // Getxattr returns the value associated with the given extended attribute + // for the file. + Getxattr(ctx context.Context, name string) (string, error) + + // Setxattr changes the value associated with the given extended attribute + // for the file. + Setxattr(ctx context.Context, opts SetxattrOptions) error + + // Removexattr removes the given extended attribute from the file. + Removexattr(ctx context.Context, name string) error + + // TODO: file locking } // Dirent holds the information contained in struct linux_dirent64. @@ -241,3 +353,230 @@ type IterDirentsCallback interface { // called. Handle(dirent Dirent) bool } + +// OnClose is called when a file descriptor representing the FileDescription is +// closed. Returning a non-nil error should not prevent the file descriptor +// from being closed. +func (fd *FileDescription) OnClose(ctx context.Context) error { + return fd.impl.OnClose(ctx) +} + +// Stat returns metadata for the file represented by fd. +func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + stat, err := fd.vd.mount.fs.impl.StatAt(ctx, rp, opts) + vfsObj.putResolvingPath(rp) + return stat, err + } + return fd.impl.Stat(ctx, opts) +} + +// SetStat updates metadata for the file represented by fd. +func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) error { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + err := fd.vd.mount.fs.impl.SetStatAt(ctx, rp, opts) + vfsObj.putResolvingPath(rp) + return err + } + return fd.impl.SetStat(ctx, opts) +} + +// StatFS returns metadata for the filesystem containing the file represented +// by fd. +func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + statfs, err := fd.vd.mount.fs.impl.StatFSAt(ctx, rp) + vfsObj.putResolvingPath(rp) + return statfs, err + } + return fd.impl.StatFS(ctx) +} + +// PRead reads from the file represented by fd into dst, starting at the given +// offset, and returns the number of bytes read. PRead is permitted to return +// partial reads with a nil error. +func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + return fd.impl.PRead(ctx, dst, offset, opts) +} + +// Read is similar to PRead, but does not specify an offset. +func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + return fd.impl.Read(ctx, dst, opts) +} + +// PWrite writes src to the file represented by fd, starting at the given +// offset, and returns the number of bytes written. PWrite is permitted to +// return partial writes with a nil error. +func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + return fd.impl.PWrite(ctx, src, offset, opts) +} + +// Write is similar to PWrite, but does not specify an offset. +func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + return fd.impl.Write(ctx, src, opts) +} + +// IterDirents invokes cb on each entry in the directory represented by fd. If +// IterDirents has been called since the last call to Seek, it continues +// iteration from the end of the last call. +func (fd *FileDescription) IterDirents(ctx context.Context, cb IterDirentsCallback) error { + return fd.impl.IterDirents(ctx, cb) +} + +// Seek changes fd's offset (assuming one exists) and returns its new value. +func (fd *FileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return fd.impl.Seek(ctx, offset, whence) +} + +// Sync has the semantics of fsync(2). +func (fd *FileDescription) Sync(ctx context.Context) error { + return fd.impl.Sync(ctx) +} + +// ConfigureMMap mutates opts to implement mmap(2) for the file represented by +// fd. +func (fd *FileDescription) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + return fd.impl.ConfigureMMap(ctx, opts) +} + +// Ioctl implements the ioctl(2) syscall. +func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return fd.impl.Ioctl(ctx, uio, args) +} + +// Listxattr returns all extended attribute names for the file represented by +// fd. +func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp) + vfsObj.putResolvingPath(rp) + return names, err + } + names, err := fd.impl.Listxattr(ctx) + if err == syserror.ENOTSUP { + // Linux doesn't actually return ENOTSUP in this case; instead, + // fs/xattr.c:vfs_listxattr() falls back to allowing the security + // subsystem to return security extended attributes, which by default + // don't exist. + return nil, nil + } + return names, err +} + +// Getxattr returns the value associated with the given extended attribute for +// the file represented by fd. +func (fd *FileDescription) Getxattr(ctx context.Context, name string) (string, error) { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, name) + vfsObj.putResolvingPath(rp) + return val, err + } + return fd.impl.Getxattr(ctx, name) +} + +// Setxattr changes the value associated with the given extended attribute for +// the file represented by fd. +func (fd *FileDescription) Setxattr(ctx context.Context, opts SetxattrOptions) error { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, opts) + vfsObj.putResolvingPath(rp) + return err + } + return fd.impl.Setxattr(ctx, opts) +} + +// Removexattr removes the given extended attribute from the file represented +// by fd. +func (fd *FileDescription) Removexattr(ctx context.Context, name string) error { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + err := fd.vd.mount.fs.impl.RemovexattrAt(ctx, rp, name) + vfsObj.putResolvingPath(rp) + return err + } + return fd.impl.Removexattr(ctx, name) +} + +// SyncFS instructs the filesystem containing fd to execute the semantics of +// syncfs(2). +func (fd *FileDescription) SyncFS(ctx context.Context) error { + return fd.vd.mount.fs.impl.Sync(ctx) +} + +// MappedName implements memmap.MappingIdentity.MappedName. +func (fd *FileDescription) MappedName(ctx context.Context) string { + vfsroot := RootFromContext(ctx) + s, _ := fd.vd.mount.vfs.PathnameWithDeleted(ctx, vfsroot, fd.vd) + if vfsroot.Ok() { + vfsroot.DecRef() + } + return s +} + +// DeviceID implements memmap.MappingIdentity.DeviceID. +func (fd *FileDescription) DeviceID() uint64 { + stat, err := fd.Stat(context.Background(), StatOptions{ + // There is no STATX_DEV; we assume that Stat will return it if it's + // available regardless of mask. + Mask: 0, + // fs/proc/task_mmu.c:show_map_vma() just reads inode::i_sb->s_dev + // directly. + Sync: linux.AT_STATX_DONT_SYNC, + }) + if err != nil { + return 0 + } + return uint64(linux.MakeDeviceID(uint16(stat.DevMajor), stat.DevMinor)) +} + +// InodeID implements memmap.MappingIdentity.InodeID. +func (fd *FileDescription) InodeID() uint64 { + stat, err := fd.Stat(context.Background(), StatOptions{ + Mask: linux.STATX_INO, + // fs/proc/task_mmu.c:show_map_vma() just reads inode::i_ino directly. + Sync: linux.AT_STATX_DONT_SYNC, + }) + if err != nil || stat.Mask&linux.STATX_INO == 0 { + return 0 + } + return stat.Ino +} + +// Msync implements memmap.MappingIdentity.Msync. +func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error { + return fd.Sync(ctx) +} diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index 4fbad7840..66eb57bc2 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -127,6 +127,31 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg return 0, syserror.ENOTTY } +// Listxattr implements FileDescriptionImpl.Listxattr analogously to +// inode_operations::listxattr == NULL in Linux. +func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context) ([]string, error) { + // This isn't exactly accurate; see FileDescription.Listxattr. + return nil, syserror.ENOTSUP +} + +// Getxattr implements FileDescriptionImpl.Getxattr analogously to +// inode::i_opflags & IOP_XATTR == 0 in Linux. +func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, name string) (string, error) { + return "", syserror.ENOTSUP +} + +// Setxattr implements FileDescriptionImpl.Setxattr analogously to +// inode::i_opflags & IOP_XATTR == 0 in Linux. +func (FileDescriptionDefaultImpl) Setxattr(ctx context.Context, opts SetxattrOptions) error { + return syserror.ENOTSUP +} + +// Removexattr implements FileDescriptionImpl.Removexattr analogously to +// inode::i_opflags & IOP_XATTR == 0 in Linux. +func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) error { + return syserror.ENOTSUP +} + // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of // FileDescriptionImpl that always represent directories to obtain // implementations of non-directory I/O methods that return EISDIR. @@ -152,6 +177,21 @@ func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src userme return 0, syserror.EISDIR } +// DentryMetadataFileDescriptionImpl may be embedded by implementations of +// FileDescriptionImpl for which FileDescriptionOptions.UseDentryMetadata is +// true to obtain implementations of Stat and SetStat that panic. +type DentryMetadataFileDescriptionImpl struct{} + +// Stat implements FileDescriptionImpl.Stat. +func (DentryMetadataFileDescriptionImpl) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { + panic("illegal call to DentryMetadataFileDescriptionImpl.Stat") +} + +// SetStat implements FileDescriptionImpl.SetStat. +func (DentryMetadataFileDescriptionImpl) SetStat(ctx context.Context, opts SetStatOptions) error { + panic("illegal call to DentryMetadataFileDescriptionImpl.SetStat") +} + // DynamicBytesFileDescriptionImpl may be embedded by implementations of // FileDescriptionImpl that represent read-only regular files whose contents // are backed by a bytes.Buffer that is regenerated when necessary, consistent @@ -174,6 +214,17 @@ type DynamicBytesSource interface { Generate(ctx context.Context, buf *bytes.Buffer) error } +// StaticData implements DynamicBytesSource over a static string. +type StaticData struct { + Data string +} + +// Generate implements DynamicBytesSource. +func (s *StaticData) Generate(ctx context.Context, buf *bytes.Buffer) error { + buf.WriteString(s.Data) + return nil +} + // SetDataSource must be called exactly once on fd before first use. func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) { fd.data = data @@ -252,3 +303,12 @@ func (fd *DynamicBytesFileDescriptionImpl) Seek(ctx context.Context, offset int6 fd.off = offset return offset, nil } + +// GenericConfigureMMap may be used by most implementations of +// FileDescriptionImpl.ConfigureMMap. +func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.MMapOpts) error { + opts.Mappable = m + opts.MappingIdentity = fd + fd.IncRef() + return nil +} diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index a5561dcbe..9ed58512f 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -48,7 +48,7 @@ type genCountFD struct { func newGenCountFD(mnt *Mount, vfsd *Dentry) *FileDescription { var fd genCountFD - fd.vfsfd.Init(&fd, mnt, vfsd) + fd.vfsfd.Init(&fd, 0 /* statusFlags */, mnt, vfsd, &FileDescriptionOptions{}) fd.DynamicBytesFileDescriptionImpl.SetDataSource(&fd) return &fd.vfsfd } @@ -89,7 +89,7 @@ func TestGenCountFD(t *testing.T) { creds := auth.CredentialsFromContext(ctx) vfsObj := New() // vfs.New() - vfsObj.MustRegisterFilesystemType("testfs", FDTestFilesystemType{}) + vfsObj.MustRegisterFilesystemType("testfs", FDTestFilesystemType{}, &RegisterFilesystemTypeOptions{}) mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "testfs", &GetFilesystemOptions{}) if err != nil { t.Fatalf("failed to create testfs root mount: %v", err) @@ -103,7 +103,7 @@ func TestGenCountFD(t *testing.T) { // The first read causes Generate to be called to fill the FD's buffer. buf := make([]byte, 2) ioseq := usermem.BytesIOSequence(buf) - n, err := fd.Impl().Read(ctx, ioseq, ReadOptions{}) + n, err := fd.Read(ctx, ioseq, ReadOptions{}) if n != 1 || (err != nil && err != io.EOF) { t.Fatalf("first Read: got (%d, %v), wanted (1, nil or EOF)", n, err) } @@ -112,17 +112,17 @@ func TestGenCountFD(t *testing.T) { } // A second read without seeking is still at EOF. - n, err = fd.Impl().Read(ctx, ioseq, ReadOptions{}) + n, err = fd.Read(ctx, ioseq, ReadOptions{}) if n != 0 || err != io.EOF { t.Fatalf("second Read: got (%d, %v), wanted (0, EOF)", n, err) } // Seeking to the beginning of the file causes it to be regenerated. - n, err = fd.Impl().Seek(ctx, 0, linux.SEEK_SET) + n, err = fd.Seek(ctx, 0, linux.SEEK_SET) if n != 0 || err != nil { t.Fatalf("Seek: got (%d, %v), wanted (0, nil)", n, err) } - n, err = fd.Impl().Read(ctx, ioseq, ReadOptions{}) + n, err = fd.Read(ctx, ioseq, ReadOptions{}) if n != 1 || (err != nil && err != io.EOF) { t.Fatalf("Read after Seek: got (%d, %v), wanted (1, nil or EOF)", n, err) } @@ -131,7 +131,7 @@ func TestGenCountFD(t *testing.T) { } // PRead at the beginning of the file also causes it to be regenerated. - n, err = fd.Impl().PRead(ctx, ioseq, 0, ReadOptions{}) + n, err = fd.PRead(ctx, ioseq, 0, ReadOptions{}) if n != 1 || (err != nil && err != io.EOF) { t.Fatalf("PRead: got (%d, %v), wanted (1, nil or EOF)", n, err) } diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index 76ff8cf51..ea78f555b 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -18,6 +18,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" ) @@ -47,6 +48,9 @@ func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, impl FilesystemImpl) { fs.refs = 1 fs.vfs = vfsObj fs.impl = impl + vfsObj.filesystemsMu.Lock() + vfsObj.filesystems[fs] = struct{}{} + vfsObj.filesystemsMu.Unlock() } // VirtualFilesystem returns the containing VirtualFilesystem. @@ -66,9 +70,28 @@ func (fs *Filesystem) IncRef() { } } +// TryIncRef increments fs' reference count and returns true. If fs' reference +// count is zero, TryIncRef does nothing and returns false. +// +// TryIncRef does not require that a reference is held on fs. +func (fs *Filesystem) TryIncRef() bool { + for { + refs := atomic.LoadInt64(&fs.refs) + if refs <= 0 { + return false + } + if atomic.CompareAndSwapInt64(&fs.refs, refs, refs+1) { + return true + } + } +} + // DecRef decrements fs' reference count. func (fs *Filesystem) DecRef() { if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 { + fs.vfs.filesystemsMu.Lock() + delete(fs.vfs.filesystems, fs) + fs.vfs.filesystemsMu.Unlock() fs.impl.Release() } else if refs < 0 { panic("Filesystem.decRef() called without holding a reference") @@ -85,6 +108,24 @@ func (fs *Filesystem) DecRef() { // (responsible for actually implementing the operation) isn't known until path // resolution is complete. // +// Unless otherwise specified, FilesystemImpl methods are responsible for +// performing permission checks. In many cases, vfs package functions in +// permissions.go may be used to help perform these checks. +// +// When multiple specified error conditions apply to a given method call, the +// implementation may return any applicable errno unless otherwise specified, +// but returning the earliest error specified is preferable to maximize +// compatibility with Linux. +// +// All methods may return errors not specified, notably including: +// +// - ENOENT if a required path component does not exist. +// +// - ENOTDIR if an intermediate path component is not a directory. +// +// - Errors from vfs-package functions (ResolvingPath.Resolve*(), +// Mount.CheckBeginWrite(), permission-checking functions, etc.) +// // For all methods that take or return linux.Statx, Statx.Uid and Statx.Gid // should be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID // and auth.KGID respectively). @@ -107,46 +148,223 @@ type FilesystemImpl interface { // GetDentryAt does not correspond directly to a Linux syscall; it is used // in the implementation of: // - // - Syscalls that need to resolve two paths: rename(), renameat(), - // renameat2(), link(), linkat(). + // - Syscalls that need to resolve two paths: link(), linkat(). // // - Syscalls that need to refer to a filesystem position outside the // context of a file description: chdir(), fchdir(), chroot(), mount(), // umount(). GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) + // GetParentDentryAt returns a Dentry representing the directory at the + // second-to-last path component in rp. (Note that, despite the name, this + // is not necessarily the parent directory of the file at rp, since the + // last path component in rp may be "." or "..".) A reference is taken on + // the returned Dentry. + // + // GetParentDentryAt does not correspond directly to a Linux syscall; it is + // used in the implementation of the rename() family of syscalls, which + // must resolve the parent directories of two paths. + // + // Preconditions: !rp.Done(). + // + // Postconditions: If GetParentDentryAt returns a nil error, then + // rp.Final(). If GetParentDentryAt returns an error returned by + // ResolvingPath.Resolve*(), then !rp.Done(). + GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error) + // LinkAt creates a hard link at rp representing the same file as vd. It // does not take ownership of references on vd. // - // The implementation is responsible for checking that vd.Mount() == - // rp.Mount(), and that vd does not represent a directory. + // Errors: + // + // - If the last path component in rp is "." or "..", LinkAt returns + // EEXIST. + // + // - If a file already exists at rp, LinkAt returns EEXIST. + // + // - If rp.MustBeDir(), LinkAt returns ENOENT. + // + // - If the directory in which the link would be created has been removed + // by RmdirAt or RenameAt, LinkAt returns ENOENT. + // + // - If rp.Mount != vd.Mount(), LinkAt returns EXDEV. + // + // - If vd represents a directory, LinkAt returns EPERM. + // + // - If vd represents a file for which all existing links have been + // removed, or a file created by open(O_TMPFILE|O_EXCL), LinkAt returns + // ENOENT. Equivalently, if vd represents a file with a link count of 0 not + // created by open(O_TMPFILE) without O_EXCL, LinkAt returns ENOENT. + // + // Preconditions: !rp.Done(). For the final path component in rp, + // !rp.ShouldFollowSymlink(). + // + // Postconditions: If LinkAt returns an error returned by + // ResolvingPath.Resolve*(), then !rp.Done(). LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error // MkdirAt creates a directory at rp. + // + // Errors: + // + // - If the last path component in rp is "." or "..", MkdirAt returns + // EEXIST. + // + // - If a file already exists at rp, MkdirAt returns EEXIST. + // + // - If the directory in which the new directory would be created has been + // removed by RmdirAt or RenameAt, MkdirAt returns ENOENT. + // + // Preconditions: !rp.Done(). For the final path component in rp, + // !rp.ShouldFollowSymlink(). + // + // Postconditions: If MkdirAt returns an error returned by + // ResolvingPath.Resolve*(), then !rp.Done(). MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error // MknodAt creates a regular file, device special file, or named pipe at // rp. + // + // Errors: + // + // - If the last path component in rp is "." or "..", MknodAt returns + // EEXIST. + // + // - If a file already exists at rp, MknodAt returns EEXIST. + // + // - If rp.MustBeDir(), MknodAt returns ENOENT. + // + // - If the directory in which the file would be created has been removed + // by RmdirAt or RenameAt, MknodAt returns ENOENT. + // + // Preconditions: !rp.Done(). For the final path component in rp, + // !rp.ShouldFollowSymlink(). + // + // Postconditions: If MknodAt returns an error returned by + // ResolvingPath.Resolve*(), then !rp.Done(). MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error // OpenAt returns an FileDescription providing access to the file at rp. A // reference is taken on the returned FileDescription. + // + // Errors: + // + // - If opts.Flags specifies O_TMPFILE and this feature is unsupported by + // the implementation, OpenAt returns EOPNOTSUPP. (All other unsupported + // features are silently ignored, consistently with Linux's open*(2).) OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) // ReadlinkAt returns the target of the symbolic link at rp. + // + // Errors: + // + // - If the file at rp is not a symbolic link, ReadlinkAt returns EINVAL. ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) - // RenameAt renames the Dentry represented by vd to rp. It does not take - // ownership of references on vd. + // RenameAt renames the file named oldName in directory oldParentVD to rp. + // It does not take ownership of references on oldParentVD. + // + // Errors [1]: + // + // - If opts.Flags specifies unsupported options, RenameAt returns EINVAL. + // + // - If the last path component in rp is "." or "..", and opts.Flags + // contains RENAME_NOREPLACE, RenameAt returns EEXIST. + // + // - If the last path component in rp is "." or "..", and opts.Flags does + // not contain RENAME_NOREPLACE, RenameAt returns EBUSY. + // + // - If rp.Mount != oldParentVD.Mount(), RenameAt returns EXDEV. + // + // - If the renamed file is not a directory, and opts.MustBeDir is true, + // RenameAt returns ENOTDIR. + // + // - If renaming would replace an existing file and opts.Flags contains + // RENAME_NOREPLACE, RenameAt returns EEXIST. + // + // - If there is no existing file at rp and opts.Flags contains + // RENAME_EXCHANGE, RenameAt returns ENOENT. + // + // - If there is an existing non-directory file at rp, and rp.MustBeDir() + // is true, RenameAt returns ENOTDIR. + // + // - If the renamed file is not a directory, opts.Flags does not contain + // RENAME_EXCHANGE, and rp.MustBeDir() is true, RenameAt returns ENOTDIR. + // (This check is not subsumed by the check for directory replacement below + // since it applies even if there is no file to replace.) + // + // - If the renamed file is a directory, and the new parent directory of + // the renamed file is either the renamed directory or a descendant + // subdirectory of the renamed directory, RenameAt returns EINVAL. + // + // - If renaming would exchange the renamed file with an ancestor directory + // of the renamed file, RenameAt returns EINVAL. + // + // - If renaming would replace an ancestor directory of the renamed file, + // RenameAt returns ENOTEMPTY. (This check would be subsumed by the + // non-empty directory check below; however, this check takes place before + // the self-rename check.) + // + // - If the renamed file would replace or exchange with itself (i.e. the + // source and destination paths resolve to the same file), RenameAt returns + // nil, skipping the checks described below. + // + // - If the source or destination directory is not writable by the provider + // of rp.Credentials(), RenameAt returns EACCES. + // + // - If the renamed file is a directory, and renaming would replace a + // non-directory file, RenameAt returns ENOTDIR. // - // The implementation is responsible for checking that vd.Mount() == - // rp.Mount(). - RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error + // - If the renamed file is not a directory, and renaming would replace a + // directory, RenameAt returns EISDIR. + // + // - If the new parent directory of the renamed file has been removed by + // RmdirAt or a preceding call to RenameAt, RenameAt returns ENOENT. + // + // - If the renamed file is a directory, it is not writable by the + // provider of rp.Credentials(), and the source and destination parent + // directories are different, RenameAt returns EACCES. (This is nominally + // required to change the ".." entry in the renamed directory.) + // + // - If renaming would replace a non-empty directory, RenameAt returns + // ENOTEMPTY. + // + // Preconditions: !rp.Done(). For the final path component in rp, + // !rp.ShouldFollowSymlink(). oldName is not "." or "..". + // + // Postconditions: If RenameAt returns an error returned by + // ResolvingPath.Resolve*(), then !rp.Done(). + // + // [1] "The worst of all namespace operations - renaming directory. + // "Perverted" doesn't even start to describe it. Somebody in UCB had a + // heck of a trip..." - fs/namei.c:vfs_rename() + RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error // RmdirAt removes the directory at rp. + // + // Errors: + // + // - If the last path component in rp is ".", RmdirAt returns EINVAL. + // + // - If the last path component in rp is "..", RmdirAt returns ENOTEMPTY. + // + // - If no file exists at rp, RmdirAt returns ENOENT. + // + // - If the file at rp exists but is not a directory, RmdirAt returns + // ENOTDIR. + // + // Preconditions: !rp.Done(). For the final path component in rp, + // !rp.ShouldFollowSymlink(). + // + // Postconditions: If RmdirAt returns an error returned by + // ResolvingPath.Resolve*(), then !rp.Done(). RmdirAt(ctx context.Context, rp *ResolvingPath) error // SetStatAt updates metadata for the file at the given path. + // + // Errors: + // + // - If opts specifies unsupported options, SetStatAt returns EINVAL. SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error // StatAt returns metadata for the file at rp. @@ -158,10 +376,132 @@ type FilesystemImpl interface { StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) // SymlinkAt creates a symbolic link at rp referring to the given target. + // + // Errors: + // + // - If the last path component in rp is "." or "..", SymlinkAt returns + // EEXIST. + // + // - If a file already exists at rp, SymlinkAt returns EEXIST. + // + // - If rp.MustBeDir(), SymlinkAt returns ENOENT. + // + // - If the directory in which the symbolic link would be created has been + // removed by RmdirAt or RenameAt, SymlinkAt returns ENOENT. + // + // Preconditions: !rp.Done(). For the final path component in rp, + // !rp.ShouldFollowSymlink(). + // + // Postconditions: If SymlinkAt returns an error returned by + // ResolvingPath.Resolve*(), then !rp.Done(). SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error - // UnlinkAt removes the non-directory file at rp. + // UnlinkAt removes the file at rp. + // + // Errors: + // + // - If the last path component in rp is "." or "..", UnlinkAt returns + // EISDIR. + // + // - If no file exists at rp, UnlinkAt returns ENOENT. + // + // - If rp.MustBeDir(), and the file at rp exists and is not a directory, + // UnlinkAt returns ENOTDIR. + // + // - If the file at rp exists but is a directory, UnlinkAt returns EISDIR. + // + // Preconditions: !rp.Done(). For the final path component in rp, + // !rp.ShouldFollowSymlink(). + // + // Postconditions: If UnlinkAt returns an error returned by + // ResolvingPath.Resolve*(), then !rp.Done(). UnlinkAt(ctx context.Context, rp *ResolvingPath) error - // TODO: d_path(); extended attributes; inotify_add_watch(); bind() + // ListxattrAt returns all extended attribute names for the file at rp. + // + // Errors: + // + // - If extended attributes are not supported by the filesystem, + // ListxattrAt returns nil. (See FileDescription.Listxattr for an + // explanation.) + ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) + + // GetxattrAt returns the value associated with the given extended + // attribute for the file at rp. + // + // Errors: + // + // - If extended attributes are not supported by the filesystem, GetxattrAt + // returns ENOTSUP. + GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) + + // SetxattrAt changes the value associated with the given extended + // attribute for the file at rp. + // + // Errors: + // + // - If extended attributes are not supported by the filesystem, SetxattrAt + // returns ENOTSUP. + SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error + + // RemovexattrAt removes the given extended attribute from the file at rp. + // + // Errors: + // + // - If extended attributes are not supported by the filesystem, + // RemovexattrAt returns ENOTSUP. + RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error + + // PrependPath prepends a path from vd to vd.Mount().Root() to b. + // + // If vfsroot.Ok(), it is the contextual VFS root; if it is encountered + // before vd.Mount().Root(), PrependPath should stop prepending path + // components and return a PrependPathAtVFSRootError. + // + // If traversal of vd.Dentry()'s ancestors encounters an independent + // ("root") Dentry that is not vd.Mount().Root() (i.e. vd.Dentry() is not a + // descendant of vd.Mount().Root()), PrependPath should stop prepending + // path components and return a PrependPathAtNonMountRootError. + // + // Filesystems for which Dentries do not have meaningful paths may prepend + // an arbitrary descriptive string to b and then return a + // PrependPathSyntheticError. + // + // Most implementations can acquire the appropriate locks to ensure that + // Dentry.Name() and Dentry.Parent() are fixed for vd.Dentry() and all of + // its ancestors, then call GenericPrependPath. + // + // Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl. + PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error + + // TODO: inotify_add_watch(); bind() +} + +// PrependPathAtVFSRootError is returned by implementations of +// FilesystemImpl.PrependPath() when they encounter the contextual VFS root. +type PrependPathAtVFSRootError struct{} + +// Error implements error.Error. +func (PrependPathAtVFSRootError) Error() string { + return "vfs.FilesystemImpl.PrependPath() reached VFS root" +} + +// PrependPathAtNonMountRootError is returned by implementations of +// FilesystemImpl.PrependPath() when they encounter an independent ancestor +// Dentry that is not the Mount root. +type PrependPathAtNonMountRootError struct{} + +// Error implements error.Error. +func (PrependPathAtNonMountRootError) Error() string { + return "vfs.FilesystemImpl.PrependPath() reached root other than Mount root" +} + +// PrependPathSyntheticError is returned by implementations of +// FilesystemImpl.PrependPath() for which prepended names do not represent real +// paths. +type PrependPathSyntheticError struct{} + +// Error implements error.Error. +func (PrependPathSyntheticError) Error() string { + return "vfs.FilesystemImpl.PrependPath() prepended synthetic name" } diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go index 465e610e0..7315a588e 100644 --- a/pkg/sentry/vfs/filesystem_impl_util.go +++ b/pkg/sentry/vfs/filesystem_impl_util.go @@ -16,6 +16,8 @@ package vfs import ( "strings" + + "gvisor.dev/gvisor/pkg/fspath" ) // GenericParseMountOptions parses a comma-separated list of options of the @@ -41,3 +43,27 @@ func GenericParseMountOptions(str string) map[string]string { } return m } + +// GenericPrependPath may be used by implementations of +// FilesystemImpl.PrependPath() for which a single statically-determined lock +// or set of locks is sufficient to ensure its preconditions (as opposed to +// e.g. per-Dentry locks). +// +// Preconditions: Dentry.Name() and Dentry.Parent() must be held constant for +// vd.Dentry() and all of its ancestors. +func GenericPrependPath(vfsroot, vd VirtualDentry, b *fspath.Builder) error { + mnt, d := vd.mount, vd.dentry + for { + if mnt == vfsroot.mount && d == vfsroot.dentry { + return PrependPathAtVFSRootError{} + } + if d == mnt.root { + return nil + } + if d.parent == nil { + return PrependPathAtNonMountRootError{} + } + b.PrependComponent(d.name) + d = d.parent + } +} diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go index c335e206d..023301780 100644 --- a/pkg/sentry/vfs/filesystem_type.go +++ b/pkg/sentry/vfs/filesystem_type.go @@ -15,6 +15,7 @@ package vfs import ( + "bytes" "fmt" "gvisor.dev/gvisor/pkg/sentry/context" @@ -43,28 +44,70 @@ type GetFilesystemOptions struct { InternalData interface{} } +type registeredFilesystemType struct { + fsType FilesystemType + opts RegisterFilesystemTypeOptions +} + +// RegisterFilesystemTypeOptions contains options to +// VirtualFilesystem.RegisterFilesystem(). +type RegisterFilesystemTypeOptions struct { + // If AllowUserMount is true, allow calls to VirtualFilesystem.MountAt() + // for which MountOptions.InternalMount == false to use this filesystem + // type. + AllowUserMount bool + + // If AllowUserList is true, make this filesystem type visible in + // /proc/filesystems. + AllowUserList bool + + // If RequiresDevice is true, indicate that mounting this filesystem + // requires a block device as the mount source in /proc/filesystems. + RequiresDevice bool +} + // RegisterFilesystemType registers the given FilesystemType in vfs with the // given name. -func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType) error { +func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) error { vfs.fsTypesMu.Lock() defer vfs.fsTypesMu.Unlock() if existing, ok := vfs.fsTypes[name]; ok { - return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing) + return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing.fsType) + } + vfs.fsTypes[name] = ®isteredFilesystemType{ + fsType: fsType, + opts: *opts, } - vfs.fsTypes[name] = fsType return nil } // MustRegisterFilesystemType is equivalent to RegisterFilesystemType but // panics on failure. -func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType) { - if err := vfs.RegisterFilesystemType(name, fsType); err != nil { +func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) { + if err := vfs.RegisterFilesystemType(name, fsType, opts); err != nil { panic(fmt.Sprintf("failed to register filesystem type %T: %v", fsType, err)) } } -func (vfs *VirtualFilesystem) getFilesystemType(name string) FilesystemType { +func (vfs *VirtualFilesystem) getFilesystemType(name string) *registeredFilesystemType { vfs.fsTypesMu.RLock() defer vfs.fsTypesMu.RUnlock() return vfs.fsTypes[name] } + +// GenerateProcFilesystems emits the contents of /proc/filesystems for vfs to +// buf. +func (vfs *VirtualFilesystem) GenerateProcFilesystems(buf *bytes.Buffer) { + vfs.fsTypesMu.RLock() + defer vfs.fsTypesMu.RUnlock() + for name, rft := range vfs.fsTypes { + if !rft.opts.AllowUserList { + continue + } + var nodev string + if !rft.opts.RequiresDevice { + nodev = "nodev" + } + fmt.Fprintf(buf, "%s\t%s\n", nodev, name) + } +} diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 1c3b2e987..00177b371 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -18,6 +18,7 @@ import ( "math" "sync/atomic" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" @@ -111,11 +112,11 @@ type MountNamespace struct { // configured by the given arguments. A reference is taken on the returned // MountNamespace. func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) { - fsType := vfs.getFilesystemType(fsTypeName) - if fsType == nil { + rft := vfs.getFilesystemType(fsTypeName) + if rft == nil { return nil, syserror.ENODEV } - fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, *opts) + fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts) if err != nil { return nil, err } @@ -133,13 +134,16 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth return mntns, nil } -// NewMount creates and mounts a Filesystem configured by the given arguments. -func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *GetFilesystemOptions) error { - fsType := vfs.getFilesystemType(fsTypeName) - if fsType == nil { +// MountAt creates and mounts a Filesystem configured by the given arguments. +func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error { + rft := vfs.getFilesystemType(fsTypeName) + if rft == nil { return syserror.ENODEV } - fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, *opts) + if !opts.InternalMount && !rft.opts.AllowUserMount { + return syserror.ENODEV + } + fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions) if err != nil { return err } @@ -207,6 +211,68 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti return nil } +// UmountAt removes the Mount at the given path. +func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error { + if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 { + return syserror.EINVAL + } + + // MNT_FORCE is currently unimplemented except for the permission check. + if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) { + return syserror.EPERM + } + + vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{}) + if err != nil { + return err + } + defer vd.DecRef() + if vd.dentry != vd.mount.root { + return syserror.EINVAL + } + vfs.mountMu.Lock() + if mntns := MountNamespaceFromContext(ctx); mntns != nil && mntns != vd.mount.ns { + vfs.mountMu.Unlock() + return syserror.EINVAL + } + + // TODO(jamieliu): Linux special-cases umount of the caller's root, which + // we don't implement yet (we'll just fail it since the caller holds a + // reference on it). + + vfs.mounts.seq.BeginWrite() + if opts.Flags&linux.MNT_DETACH == 0 { + if len(vd.mount.children) != 0 { + vfs.mounts.seq.EndWrite() + vfs.mountMu.Unlock() + return syserror.EBUSY + } + // We are holding a reference on vd.mount. + expectedRefs := int64(1) + if !vd.mount.umounted { + expectedRefs = 2 + } + if atomic.LoadInt64(&vd.mount.refs)&^math.MinInt64 != expectedRefs { // mask out MSB + vfs.mounts.seq.EndWrite() + vfs.mountMu.Unlock() + return syserror.EBUSY + } + } + vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(vd.mount, &umountRecursiveOptions{ + eager: opts.Flags&linux.MNT_DETACH == 0, + disconnectHierarchy: true, + }, nil, nil) + vfs.mounts.seq.EndWrite() + vfs.mountMu.Unlock() + for _, vd := range vdsToDecRef { + vd.DecRef() + } + for _, mnt := range mountsToDecRef { + mnt.DecRef() + } + return nil +} + type umountRecursiveOptions struct { // If eager is true, ensure that future calls to Mount.tryIncMountedRef() // on umounted mounts fail. diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index 3aa73d911..b7774bf28 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -46,6 +46,16 @@ type MknodOptions struct { DevMinor uint32 } +// MountOptions contains options to VirtualFilesystem.MountAt(). +type MountOptions struct { + // GetFilesystemOptions contains options to FilesystemType.GetFilesystem(). + GetFilesystemOptions GetFilesystemOptions + + // If InternalMount is true, allow the use of filesystem types for which + // RegisterFilesystemTypeOptions.AllowUserMount == false. + InternalMount bool +} + // OpenOptions contains options to VirtualFilesystem.OpenAt() and // FilesystemImpl.OpenAt(). type OpenOptions struct { @@ -77,6 +87,9 @@ type ReadOptions struct { type RenameOptions struct { // Flags contains flags as specified for renameat2(2). Flags uint32 + + // If MustBeDir is true, the renamed file must be a directory. + MustBeDir bool } // SetStatOptions contains options to VirtualFilesystem.SetStatAt(), @@ -95,6 +108,20 @@ type SetStatOptions struct { Stat linux.Statx } +// SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(), +// FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and +// FileDescriptionImpl.Setxattr(). +type SetxattrOptions struct { + // Name is the name of the extended attribute being mutated. + Name string + + // Value is the extended attribute's new value. + Value string + + // Flags contains flags as specified for setxattr/lsetxattr/fsetxattr(2). + Flags uint32 +} + // StatOptions contains options to VirtualFilesystem.StatAt(), // FilesystemImpl.StatAt(), FileDescription.Stat(), and // FileDescriptionImpl.Stat(). @@ -114,6 +141,12 @@ type StatOptions struct { Sync uint32 } +// UmountOptions contains options to VirtualFilesystem.UmountAt(). +type UmountOptions struct { + // Flags contains flags as specified for umount2(2). + Flags uint32 +} + // WriteOptions contains options to FileDescription.PWrite(), // FileDescriptionImpl.PWrite(), FileDescription.Write(), and // FileDescriptionImpl.Write(). diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go new file mode 100644 index 000000000..8e155654f --- /dev/null +++ b/pkg/sentry/vfs/pathname.go @@ -0,0 +1,153 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "sync" + + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/syserror" +) + +var fspathBuilderPool = sync.Pool{ + New: func() interface{} { + return &fspath.Builder{} + }, +} + +func getFSPathBuilder() *fspath.Builder { + return fspathBuilderPool.Get().(*fspath.Builder) +} + +func putFSPathBuilder(b *fspath.Builder) { + // No methods can be called on b after b.String(), so reset it to its zero + // value (as returned by fspathBuilderPool.New) instead. + *b = fspath.Builder{} + fspathBuilderPool.Put(b) +} + +// PathnameWithDeleted returns an absolute pathname to vd, consistent with +// Linux's d_path(). In particular, if vd.Dentry() has been disowned, +// PathnameWithDeleted appends " (deleted)" to the returned pathname. +func (vfs *VirtualFilesystem) PathnameWithDeleted(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) { + b := getFSPathBuilder() + defer putFSPathBuilder(b) + haveRef := false + defer func() { + if haveRef { + vd.DecRef() + } + }() + + origD := vd.dentry +loop: + for { + err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b) + switch err.(type) { + case nil: + if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry { + // GenericPrependPath() will have returned + // PrependPathAtVFSRootError in this case since it checks + // against vfsroot before mnt.root, but other implementations + // of FilesystemImpl.PrependPath() may return nil instead. + break loop + } + nextVD := vfs.getMountpointAt(vd.mount, vfsroot) + if !nextVD.Ok() { + break loop + } + if haveRef { + vd.DecRef() + } + vd = nextVD + haveRef = true + // continue loop + case PrependPathSyntheticError: + // Skip prepending "/" and appending " (deleted)". + return b.String(), nil + case PrependPathAtVFSRootError, PrependPathAtNonMountRootError: + break loop + default: + return "", err + } + } + b.PrependByte('/') + if origD.IsDisowned() { + b.AppendString(" (deleted)") + } + return b.String(), nil +} + +// PathnameForGetcwd returns an absolute pathname to vd, consistent with +// Linux's sys_getcwd(). +func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) { + if vd.dentry.IsDisowned() { + return "", syserror.ENOENT + } + + b := getFSPathBuilder() + defer putFSPathBuilder(b) + haveRef := false + defer func() { + if haveRef { + vd.DecRef() + } + }() + unreachable := false +loop: + for { + err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b) + switch err.(type) { + case nil: + if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry { + break loop + } + nextVD := vfs.getMountpointAt(vd.mount, vfsroot) + if !nextVD.Ok() { + unreachable = true + break loop + } + if haveRef { + vd.DecRef() + } + vd = nextVD + haveRef = true + case PrependPathAtVFSRootError: + break loop + case PrependPathAtNonMountRootError, PrependPathSyntheticError: + unreachable = true + break loop + default: + return "", err + } + } + b.PrependByte('/') + if unreachable { + b.PrependString("(unreachable)") + } + return b.String(), nil +} + +// As of this writing, we do not have equivalents to: +// +// - d_absolute_path(), which returns EINVAL if (effectively) any call to +// FilesystemImpl.PrependPath() would return PrependPathAtNonMountRootError. +// +// - dentry_path(), which does not walk up mounts (and only returns the path +// relative to Filesystem root), but also appends "//deleted" for disowned +// Dentries. +// +// These should be added as necessary. diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index f8e74355c..f1edb0680 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -119,3 +119,65 @@ func MayWriteFileWithOpenFlags(flags uint32) bool { return false } } + +// CheckSetStat checks that creds has permission to change the metadata of a +// file with the given permissions, UID, and GID as specified by stat, subject +// to the rules of Linux's fs/attr.c:setattr_prepare(). +func CheckSetStat(creds *auth.Credentials, stat *linux.Statx, mode uint16, kuid auth.KUID, kgid auth.KGID) error { + if stat.Mask&linux.STATX_MODE != 0 { + if !CanActAsOwner(creds, kuid) { + return syserror.EPERM + } + // TODO(b/30815691): "If the calling process is not privileged (Linux: + // does not have the CAP_FSETID capability), and the group of the file + // does not match the effective group ID of the process or one of its + // supplementary group IDs, the S_ISGID bit will be turned off, but + // this will not cause an error to be returned." - chmod(2) + } + if stat.Mask&linux.STATX_UID != 0 { + if !((creds.EffectiveKUID == kuid && auth.KUID(stat.UID) == kuid) || + HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid)) { + return syserror.EPERM + } + } + if stat.Mask&linux.STATX_GID != 0 { + if !((creds.EffectiveKUID == kuid && creds.InGroup(auth.KGID(stat.GID))) || + HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid)) { + return syserror.EPERM + } + } + if stat.Mask&(linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME) != 0 { + if !CanActAsOwner(creds, kuid) { + if (stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW) || + (stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW) || + (stat.Mask&linux.STATX_CTIME != 0 && stat.Ctime.Nsec != linux.UTIME_NOW) { + return syserror.EPERM + } + // isDir is irrelevant in the following call to + // GenericCheckPermissions since ats == MayWrite means that + // CAP_DAC_READ_SEARCH does not apply, and CAP_DAC_OVERRIDE + // applies, regardless of isDir. + if err := GenericCheckPermissions(creds, MayWrite, false /* isDir */, mode, kuid, kgid); err != nil { + return err + } + } + } + return nil +} + +// CanActAsOwner returns true if creds can act as the owner of a file with the +// given owning UID, consistent with Linux's +// fs/inode.c:inode_owner_or_capable(). +func CanActAsOwner(creds *auth.Credentials, kuid auth.KUID) bool { + if creds.EffectiveKUID == kuid { + return true + } + return creds.HasCapability(linux.CAP_FOWNER) && creds.UserNamespace.MapFromKUID(kuid).Ok() +} + +// HasCapabilityOnFile returns true if creds has the given capability with +// respect to a file with the given owning UID and GID, consistent with Linux's +// kernel/capability.c:capable_wrt_inode_uidgid(). +func HasCapabilityOnFile(creds *auth.Credentials, cp linux.Capability, kuid auth.KUID, kgid auth.KGID) bool { + return creds.HasCapability(cp) && creds.UserNamespace.MapFromKUID(kuid).Ok() && creds.UserNamespace.MapFromKGID(kgid).Ok() +} diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index 621f5a6f8..f0641d314 100644 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go @@ -85,11 +85,11 @@ func init() { // so error "constants" are really mutable vars, necessitating somewhat // expensive interface object comparisons. -type resolveMountRootError struct{} +type resolveMountRootOrJumpError struct{} // Error implements error.Error. -func (resolveMountRootError) Error() string { - return "resolving mount root" +func (resolveMountRootOrJumpError) Error() string { + return "resolving mount root or jump" } type resolveMountPointError struct{} @@ -112,30 +112,26 @@ var resolvingPathPool = sync.Pool{ }, } -func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) (*ResolvingPath, error) { - path, err := fspath.Parse(pop.Pathname) - if err != nil { - return nil, err - } +func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) *ResolvingPath { rp := resolvingPathPool.Get().(*ResolvingPath) rp.vfs = vfs rp.root = pop.Root rp.mount = pop.Start.mount rp.start = pop.Start.dentry - rp.pit = path.Begin + rp.pit = pop.Path.Begin rp.flags = 0 if pop.FollowFinalSymlink { rp.flags |= rpflagsFollowFinalSymlink } - rp.mustBeDir = path.Dir - rp.mustBeDirOrig = path.Dir + rp.mustBeDir = pop.Path.Dir + rp.mustBeDirOrig = pop.Path.Dir rp.symlinks = 0 rp.curPart = 0 rp.numOrigParts = 1 rp.creds = creds - rp.parts[0] = path.Begin - rp.origParts[0] = path.Begin - return rp, nil + rp.parts[0] = pop.Path.Begin + rp.origParts[0] = pop.Path.Begin + return rp } func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) { @@ -274,7 +270,7 @@ func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) { // ... of non-root mount. rp.nextMount = vd.mount rp.nextStart = vd.dentry - return nil, resolveMountRootError{} + return nil, resolveMountRootOrJumpError{} } // ... of root mount. parent = d @@ -345,29 +341,34 @@ func (rp *ResolvingPath) ShouldFollowSymlink() bool { // symlink target and returns nil. Otherwise it returns a non-nil error. // // Preconditions: !rp.Done(). +// +// Postconditions: If HandleSymlink returns a nil error, then !rp.Done(). func (rp *ResolvingPath) HandleSymlink(target string) error { if rp.symlinks >= linux.MaxSymlinkTraversals { return syserror.ELOOP } - targetPath, err := fspath.Parse(target) - if err != nil { - return err + if len(target) == 0 { + return syserror.ENOENT } rp.symlinks++ + targetPath := fspath.Parse(target) if targetPath.Absolute { rp.absSymlinkTarget = targetPath return resolveAbsSymlinkError{} } - if !targetPath.Begin.Ok() { - panic(fmt.Sprintf("symbolic link has non-empty target %q that is both relative and has no path components?", target)) - } // Consume the path component that represented the symlink. rp.Advance() // Prepend the symlink target to the relative path. + if checkInvariants { + if !targetPath.HasComponents() { + panic(fmt.Sprintf("non-empty pathname %q parsed to relative path with no components", target)) + } + } rp.relpathPrepend(targetPath) return nil } +// Preconditions: path.HasComponents(). func (rp *ResolvingPath) relpathPrepend(path fspath.Path) { if rp.pit.Ok() { rp.parts[rp.curPart] = rp.pit @@ -385,11 +386,32 @@ func (rp *ResolvingPath) relpathPrepend(path fspath.Path) { } } +// HandleJump is called when the current path component is a "magic" link to +// the given VirtualDentry, like /proc/[pid]/fd/[fd]. If the calling Filesystem +// method should continue path traversal, HandleMagicSymlink updates the path +// component stream to reflect the magic link target and returns nil. Otherwise +// it returns a non-nil error. +// +// Preconditions: !rp.Done(). +func (rp *ResolvingPath) HandleJump(target VirtualDentry) error { + if rp.symlinks >= linux.MaxSymlinkTraversals { + return syserror.ELOOP + } + rp.symlinks++ + // Consume the path component that represented the magic link. + rp.Advance() + // Unconditionally return a resolveMountRootOrJumpError, even if the Mount + // isn't changing, to force restarting at the new Dentry. + target.IncRef() + rp.nextMount = target.mount + rp.nextStart = target.dentry + return resolveMountRootOrJumpError{} +} + func (rp *ResolvingPath) handleError(err error) bool { switch err.(type) { - case resolveMountRootError: - // Switch to the new Mount. We hold references on the Mount and Dentry - // (from VFS.getMountpointAt()). + case resolveMountRootOrJumpError: + // Switch to the new Mount. We hold references on the Mount and Dentry. rp.decRefStartAndMount() rp.mount = rp.nextMount rp.start = rp.nextStart @@ -407,9 +429,8 @@ func (rp *ResolvingPath) handleError(err error) bool { return true case resolveMountPointError: - // Switch to the new Mount. We hold a reference on the Mount (from - // VFS.getMountAt()), but borrow the reference on the mount root from - // the Mount. + // Switch to the new Mount. We hold a reference on the Mount, but + // borrow the reference on the mount root from the Mount. rp.decRefStartAndMount() rp.mount = rp.nextMount rp.start = rp.nextMount.root @@ -447,6 +468,17 @@ func (rp *ResolvingPath) handleError(err error) bool { } } +// canHandleError returns true if err is an error returned by rp.Resolve*() +// that rp.handleError() may attempt to handle. +func (rp *ResolvingPath) canHandleError(err error) bool { + switch err.(type) { + case resolveMountRootOrJumpError, resolveMountPointError, resolveAbsSymlinkError: + return true + default: + return false + } +} + // MustBeDir returns true if the file traversed by rp must be a directory. func (rp *ResolvingPath) MustBeDir() bool { return rp.mustBeDir diff --git a/pkg/sentry/vfs/syscalls.go b/pkg/sentry/vfs/syscalls.go deleted file mode 100644 index 436151afa..000000000 --- a/pkg/sentry/vfs/syscalls.go +++ /dev/null @@ -1,237 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" -) - -// PathOperation specifies the path operated on by a VFS method. -// -// PathOperation is passed to VFS methods by pointer to reduce memory copying: -// it's somewhat large and should never escape. (Options structs are passed by -// pointer to VFS and FileDescription methods for the same reason.) -type PathOperation struct { - // Root is the VFS root. References on Root are borrowed from the provider - // of the PathOperation. - // - // Invariants: Root.Ok(). - Root VirtualDentry - - // Start is the starting point for the path traversal. References on Start - // are borrowed from the provider of the PathOperation (i.e. the caller of - // the VFS method to which the PathOperation was passed). - // - // Invariants: Start.Ok(). If Pathname.Absolute, then Start == Root. - Start VirtualDentry - - // Path is the pathname traversed by this operation. - Pathname string - - // If FollowFinalSymlink is true, and the Dentry traversed by the final - // path component represents a symbolic link, the symbolic link should be - // followed. - FollowFinalSymlink bool -} - -// GetDentryAt returns a VirtualDentry representing the given path, at which a -// file must exist. A reference is taken on the returned VirtualDentry. -func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) { - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return VirtualDentry{}, err - } - for { - d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts) - if err == nil { - vd := VirtualDentry{ - mount: rp.mount, - dentry: d, - } - rp.mount.IncRef() - vfs.putResolvingPath(rp) - return vd, nil - } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - return VirtualDentry{}, err - } - } -} - -// MkdirAt creates a directory at the given path. -func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error { - // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is - // also honored." - mkdir(2) - opts.Mode &= 01777 - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return err - } - for { - err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts) - if err == nil { - vfs.putResolvingPath(rp) - return nil - } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - return err - } - } -} - -// MknodAt creates a file of the given mode at the given path. It returns an -// error from the syserror package. -func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error { - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return nil - } - for { - if err = rp.mount.fs.impl.MknodAt(ctx, rp, *opts); err == nil { - vfs.putResolvingPath(rp) - return nil - } - // Handle mount traversals. - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - return err - } - } -} - -// OpenAt returns a FileDescription providing access to the file at the given -// path. A reference is taken on the returned FileDescription. -func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) { - // Remove: - // - // - O_LARGEFILE, which we always report in FileDescription status flags - // since only 64-bit architectures are supported at this time. - // - // - O_CLOEXEC, which affects file descriptors and therefore must be - // handled outside of VFS. - // - // - Unknown flags. - opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE - // Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC. - if opts.Flags&linux.O_SYNC != 0 { - opts.Flags |= linux.O_DSYNC - } - // Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified - // with O_DIRECTORY and a writable access mode (to ensure that it fails on - // filesystem implementations that do not support it). - if opts.Flags&linux.O_TMPFILE != 0 { - if opts.Flags&linux.O_DIRECTORY == 0 { - return nil, syserror.EINVAL - } - if opts.Flags&linux.O_CREAT != 0 { - return nil, syserror.EINVAL - } - if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY { - return nil, syserror.EINVAL - } - } - // O_PATH causes most other flags to be ignored. - if opts.Flags&linux.O_PATH != 0 { - opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH - } - // "On Linux, the following bits are also honored in mode: [S_ISUID, - // S_ISGID, S_ISVTX]" - open(2) - opts.Mode &= 07777 - - if opts.Flags&linux.O_NOFOLLOW != 0 { - pop.FollowFinalSymlink = false - } - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return nil, err - } - if opts.Flags&linux.O_DIRECTORY != 0 { - rp.mustBeDir = true - rp.mustBeDirOrig = true - } - for { - fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) - if err == nil { - vfs.putResolvingPath(rp) - return fd, nil - } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - return nil, err - } - } -} - -// StatAt returns metadata for the file at the given path. -func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) { - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return linux.Statx{}, err - } - for { - stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts) - if err == nil { - vfs.putResolvingPath(rp) - return stat, nil - } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - return linux.Statx{}, err - } - } -} - -// StatusFlags returns file description status flags. -func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) { - flags, err := fd.impl.StatusFlags(ctx) - flags |= linux.O_LARGEFILE - return flags, err -} - -// SetStatusFlags sets file description status flags. -func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error { - return fd.impl.SetStatusFlags(ctx, flags) -} - -// TODO: -// -// - VFS.SyncAllFilesystems() for sync(2) -// -// - Something for syncfs(2) -// -// - VFS.LinkAt() -// -// - VFS.ReadlinkAt() -// -// - VFS.RenameAt() -// -// - VFS.RmdirAt() -// -// - VFS.SetStatAt() -// -// - VFS.StatFSAt() -// -// - VFS.SymlinkAt() -// -// - VFS.UmountAt() -// -// - VFS.UnlinkAt() -// -// - FileDescription.(almost everything) diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go index 593144cb7..ee5c8b9e2 100644 --- a/pkg/sentry/vfs/testutil.go +++ b/pkg/sentry/vfs/testutil.go @@ -15,7 +15,10 @@ package vfs import ( + "fmt" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" @@ -54,6 +57,11 @@ func (fs *FDTestFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, return nil, syserror.EPERM } +// GetParentDentryAt implements FilesystemImpl.GetParentDentryAt. +func (fs *FDTestFilesystem) GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error) { + return nil, syserror.EPERM +} + // LinkAt implements FilesystemImpl.LinkAt. func (fs *FDTestFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error { return syserror.EPERM @@ -80,7 +88,7 @@ func (fs *FDTestFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) ( } // RenameAt implements FilesystemImpl.RenameAt. -func (fs *FDTestFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error { +func (fs *FDTestFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error { return syserror.EPERM } @@ -114,6 +122,32 @@ func (fs *FDTestFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) err return syserror.EPERM } +// ListxattrAt implements FilesystemImpl.ListxattrAt. +func (fs *FDTestFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) { + return nil, syserror.EPERM +} + +// GetxattrAt implements FilesystemImpl.GetxattrAt. +func (fs *FDTestFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) { + return "", syserror.EPERM +} + +// SetxattrAt implements FilesystemImpl.SetxattrAt. +func (fs *FDTestFilesystem) SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error { + return syserror.EPERM +} + +// RemovexattrAt implements FilesystemImpl.RemovexattrAt. +func (fs *FDTestFilesystem) RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error { + return syserror.EPERM +} + +// PrependPath implements FilesystemImpl.PrependPath. +func (fs *FDTestFilesystem) PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error { + b.PrependComponent(fmt.Sprintf("vfs.fdTestDentry:%p", vd.dentry.impl.(*fdTestDentry))) + return PrependPathSyntheticError{} +} + type fdTestDentry struct { vfsd Dentry } diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index f0cd3ffe5..ea2db7031 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -20,6 +20,7 @@ // VirtualFilesystem.mountMu // Dentry.mu // Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry +// VirtualFilesystem.filesystemsMu // VirtualFilesystem.fsTypesMu // // Locking Dentry.mu in multiple Dentries requires holding @@ -27,7 +28,14 @@ package vfs import ( + "fmt" "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" ) // A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts. @@ -67,22 +75,598 @@ type VirtualFilesystem struct { // mountpoints is analogous to Linux's mountpoint_hashtable. mountpoints map[*Dentry]map[*Mount]struct{} - // fsTypes contains all FilesystemTypes that are usable in the - // VirtualFilesystem. fsTypes is protected by fsTypesMu. + // devices contains all registered Devices. devices is protected by + // devicesMu. + devicesMu sync.RWMutex + devices map[devTuple]*registeredDevice + + // fsTypes contains all registered FilesystemTypes. fsTypes is protected by + // fsTypesMu. fsTypesMu sync.RWMutex - fsTypes map[string]FilesystemType + fsTypes map[string]*registeredFilesystemType + + // filesystems contains all Filesystems. filesystems is protected by + // filesystemsMu. + filesystemsMu sync.Mutex + filesystems map[*Filesystem]struct{} } // New returns a new VirtualFilesystem with no mounts or FilesystemTypes. func New() *VirtualFilesystem { vfs := &VirtualFilesystem{ mountpoints: make(map[*Dentry]map[*Mount]struct{}), - fsTypes: make(map[string]FilesystemType), + devices: make(map[devTuple]*registeredDevice), + fsTypes: make(map[string]*registeredFilesystemType), + filesystems: make(map[*Filesystem]struct{}), } vfs.mounts.Init() return vfs } +// PathOperation specifies the path operated on by a VFS method. +// +// PathOperation is passed to VFS methods by pointer to reduce memory copying: +// it's somewhat large and should never escape. (Options structs are passed by +// pointer to VFS and FileDescription methods for the same reason.) +type PathOperation struct { + // Root is the VFS root. References on Root are borrowed from the provider + // of the PathOperation. + // + // Invariants: Root.Ok(). + Root VirtualDentry + + // Start is the starting point for the path traversal. References on Start + // are borrowed from the provider of the PathOperation (i.e. the caller of + // the VFS method to which the PathOperation was passed). + // + // Invariants: Start.Ok(). If Path.Absolute, then Start == Root. + Start VirtualDentry + + // Path is the pathname traversed by this operation. + Path fspath.Path + + // If FollowFinalSymlink is true, and the Dentry traversed by the final + // path component represents a symbolic link, the symbolic link should be + // followed. + FollowFinalSymlink bool +} + +// GetDentryAt returns a VirtualDentry representing the given path, at which a +// file must exist. A reference is taken on the returned VirtualDentry. +func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) { + rp := vfs.getResolvingPath(creds, pop) + for { + d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts) + if err == nil { + vd := VirtualDentry{ + mount: rp.mount, + dentry: d, + } + rp.mount.IncRef() + vfs.putResolvingPath(rp) + return vd, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return VirtualDentry{}, err + } + } +} + +// Preconditions: pop.Path.Begin.Ok(). +func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (VirtualDentry, string, error) { + rp := vfs.getResolvingPath(creds, pop) + for { + parent, err := rp.mount.fs.impl.GetParentDentryAt(ctx, rp) + if err == nil { + parentVD := VirtualDentry{ + mount: rp.mount, + dentry: parent, + } + rp.mount.IncRef() + name := rp.Component() + vfs.putResolvingPath(rp) + return parentVD, name, nil + } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + } + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return VirtualDentry{}, "", err + } + } +} + +// LinkAt creates a hard link at newpop representing the existing file at +// oldpop. +func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation) error { + oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{}) + if err != nil { + return err + } + + if !newpop.Path.Begin.Ok() { + oldVD.DecRef() + if newpop.Path.Absolute { + return syserror.EEXIST + } + return syserror.ENOENT + } + if newpop.FollowFinalSymlink { + oldVD.DecRef() + ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink") + return syserror.EINVAL + } + + rp := vfs.getResolvingPath(creds, newpop) + for { + err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD) + if err == nil { + vfs.putResolvingPath(rp) + oldVD.DecRef() + return nil + } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + } + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + oldVD.DecRef() + return err + } + } +} + +// MkdirAt creates a directory at the given path. +func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error { + if !pop.Path.Begin.Ok() { + if pop.Path.Absolute { + return syserror.EEXIST + } + return syserror.ENOENT + } + if pop.FollowFinalSymlink { + ctx.Warningf("VirtualFilesystem.MkdirAt: file creation paths can't follow final symlink") + return syserror.EINVAL + } + // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is + // also honored." - mkdir(2) + opts.Mode &= 0777 | linux.S_ISVTX + + rp := vfs.getResolvingPath(creds, pop) + for { + err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + } + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// MknodAt creates a file of the given mode at the given path. It returns an +// error from the syserror package. +func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error { + if !pop.Path.Begin.Ok() { + if pop.Path.Absolute { + return syserror.EEXIST + } + return syserror.ENOENT + } + if pop.FollowFinalSymlink { + ctx.Warningf("VirtualFilesystem.MknodAt: file creation paths can't follow final symlink") + return syserror.EINVAL + } + + rp := vfs.getResolvingPath(creds, pop) + for { + err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts) + if err != nil { + vfs.putResolvingPath(rp) + return nil + } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + } + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// OpenAt returns a FileDescription providing access to the file at the given +// path. A reference is taken on the returned FileDescription. +func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) { + // Remove: + // + // - O_LARGEFILE, which we always report in FileDescription status flags + // since only 64-bit architectures are supported at this time. + // + // - O_CLOEXEC, which affects file descriptors and therefore must be + // handled outside of VFS. + // + // - Unknown flags. + opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE + // Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC. + if opts.Flags&linux.O_SYNC != 0 { + opts.Flags |= linux.O_DSYNC + } + // Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified + // with O_DIRECTORY and a writable access mode (to ensure that it fails on + // filesystem implementations that do not support it). + if opts.Flags&linux.O_TMPFILE != 0 { + if opts.Flags&linux.O_DIRECTORY == 0 { + return nil, syserror.EINVAL + } + if opts.Flags&linux.O_CREAT != 0 { + return nil, syserror.EINVAL + } + if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY { + return nil, syserror.EINVAL + } + } + // O_PATH causes most other flags to be ignored. + if opts.Flags&linux.O_PATH != 0 { + opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH + } + // "On Linux, the following bits are also honored in mode: [S_ISUID, + // S_ISGID, S_ISVTX]" - open(2) + opts.Mode &= 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX + + if opts.Flags&linux.O_NOFOLLOW != 0 { + pop.FollowFinalSymlink = false + } + rp := vfs.getResolvingPath(creds, pop) + if opts.Flags&linux.O_DIRECTORY != 0 { + rp.mustBeDir = true + rp.mustBeDirOrig = true + } + for { + fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return fd, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return nil, err + } + } +} + +// ReadlinkAt returns the target of the symbolic link at the given path. +func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (string, error) { + rp := vfs.getResolvingPath(creds, pop) + for { + target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp) + if err == nil { + vfs.putResolvingPath(rp) + return target, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return "", err + } + } +} + +// RenameAt renames the file at oldpop to newpop. +func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error { + if !oldpop.Path.Begin.Ok() { + if oldpop.Path.Absolute { + return syserror.EBUSY + } + return syserror.ENOENT + } + if oldpop.FollowFinalSymlink { + ctx.Warningf("VirtualFilesystem.RenameAt: source path can't follow final symlink") + return syserror.EINVAL + } + + oldParentVD, oldName, err := vfs.getParentDirAndName(ctx, creds, oldpop) + if err != nil { + return err + } + if oldName == "." || oldName == ".." { + oldParentVD.DecRef() + return syserror.EBUSY + } + + if !newpop.Path.Begin.Ok() { + oldParentVD.DecRef() + if newpop.Path.Absolute { + return syserror.EBUSY + } + return syserror.ENOENT + } + if newpop.FollowFinalSymlink { + oldParentVD.DecRef() + ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink") + return syserror.EINVAL + } + + rp := vfs.getResolvingPath(creds, newpop) + renameOpts := *opts + if oldpop.Path.Dir { + renameOpts.MustBeDir = true + } + for { + err := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts) + if err == nil { + vfs.putResolvingPath(rp) + oldParentVD.DecRef() + return nil + } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + } + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + oldParentVD.DecRef() + return err + } + } +} + +// RmdirAt removes the directory at the given path. +func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { + if !pop.Path.Begin.Ok() { + if pop.Path.Absolute { + return syserror.EBUSY + } + return syserror.ENOENT + } + if pop.FollowFinalSymlink { + ctx.Warningf("VirtualFilesystem.RmdirAt: file deletion paths can't follow final symlink") + return syserror.EINVAL + } + + rp := vfs.getResolvingPath(creds, pop) + for { + err := rp.mount.fs.impl.RmdirAt(ctx, rp) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + } + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// SetStatAt changes metadata for the file at the given path. +func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetStatOptions) error { + rp := vfs.getResolvingPath(creds, pop) + for { + err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// StatAt returns metadata for the file at the given path. +func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) { + rp := vfs.getResolvingPath(creds, pop) + for { + stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return stat, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return linux.Statx{}, err + } + } +} + +// StatFSAt returns metadata for the filesystem containing the file at the +// given path. +func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (linux.Statfs, error) { + rp := vfs.getResolvingPath(creds, pop) + for { + statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp) + if err == nil { + vfs.putResolvingPath(rp) + return statfs, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return linux.Statfs{}, err + } + } +} + +// SymlinkAt creates a symbolic link at the given path with the given target. +func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error { + if !pop.Path.Begin.Ok() { + if pop.Path.Absolute { + return syserror.EEXIST + } + return syserror.ENOENT + } + if pop.FollowFinalSymlink { + ctx.Warningf("VirtualFilesystem.SymlinkAt: file creation paths can't follow final symlink") + return syserror.EINVAL + } + + rp := vfs.getResolvingPath(creds, pop) + for { + err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + } + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// UnlinkAt deletes the non-directory file at the given path. +func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { + if !pop.Path.Begin.Ok() { + if pop.Path.Absolute { + return syserror.EBUSY + } + return syserror.ENOENT + } + if pop.FollowFinalSymlink { + ctx.Warningf("VirtualFilesystem.UnlinkAt: file deletion paths can't follow final symlink") + return syserror.EINVAL + } + + rp := vfs.getResolvingPath(creds, pop) + for { + err := rp.mount.fs.impl.UnlinkAt(ctx, rp) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + } + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// ListxattrAt returns all extended attribute names for the file at the given +// path. +func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) ([]string, error) { + rp := vfs.getResolvingPath(creds, pop) + for { + names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp) + if err == nil { + vfs.putResolvingPath(rp) + return names, nil + } + if err == syserror.ENOTSUP { + // Linux doesn't actually return ENOTSUP in this case; instead, + // fs/xattr.c:vfs_listxattr() falls back to allowing the security + // subsystem to return security extended attributes, which by + // default don't exist. + vfs.putResolvingPath(rp) + return nil, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return nil, err + } + } +} + +// GetxattrAt returns the value associated with the given extended attribute +// for the file at the given path. +func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) (string, error) { + rp := vfs.getResolvingPath(creds, pop) + for { + val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, name) + if err == nil { + vfs.putResolvingPath(rp) + return val, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return "", err + } + } +} + +// SetxattrAt changes the value associated with the given extended attribute +// for the file at the given path. +func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetxattrOptions) error { + rp := vfs.getResolvingPath(creds, pop) + for { + err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// RemovexattrAt removes the given extended attribute from the file at rp. +func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error { + rp := vfs.getResolvingPath(creds, pop) + for { + err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// SyncAllFilesystems has the semantics of Linux's sync(2). +func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error { + fss := make(map[*Filesystem]struct{}) + vfs.filesystemsMu.Lock() + for fs := range vfs.filesystems { + if !fs.TryIncRef() { + continue + } + fss[fs] = struct{}{} + } + vfs.filesystemsMu.Unlock() + var retErr error + for fs := range fss { + if err := fs.impl.Sync(ctx); err != nil && retErr == nil { + retErr = err + } + fs.DecRef() + } + return retErr +} + // A VirtualDentry represents a node in a VFS tree, by combining a Dentry // (which represents a node in a Filesystem's tree) and a Mount (which // represents the Filesystem's position in a VFS mount tree). diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go index ecce6c69f..5e4611333 100644 --- a/pkg/sentry/watchdog/watchdog.go +++ b/pkg/sentry/watchdog/watchdog.go @@ -287,7 +287,9 @@ func (w *Watchdog) runTurn() { if !ok { // New stuck task detected. // - // TODO(b/65849403): Tasks blocked doing IO may be considered stuck in kernel. + // Note that tasks blocked doing IO may be considered stuck in kernel, + // unless they are surrounded b + // Task.UninterruptibleSleepStart/Finish. tc = &offender{lastUpdateTime: lastUpdateTime} stuckTasks.Increment() newTaskFound = true diff --git a/pkg/syncutil/BUILD b/pkg/syncutil/BUILD index b06a90bef..cb1f41628 100644 --- a/pkg/syncutil/BUILD +++ b/pkg/syncutil/BUILD @@ -31,8 +31,6 @@ go_template( go_library( name = "syncutil", srcs = [ - "downgradable_rwmutex_1_12_unsafe.go", - "downgradable_rwmutex_1_13_unsafe.go", "downgradable_rwmutex_unsafe.go", "memmove_unsafe.go", "norace_unsafe.go", diff --git a/pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go b/pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go deleted file mode 100644 index 7c6336e62..000000000 --- a/pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Copyright 2019 The gVisor Authors. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build go1.12 -// +build !go1.13 - -// TODO(b/133868570): Delete once Go 1.12 is no longer supported. - -package syncutil - -import _ "unsafe" - -//go:linkname runtimeSemrelease112 sync.runtime_Semrelease -func runtimeSemrelease112(s *uint32, handoff bool) - -func runtimeSemrelease(s *uint32, handoff bool, skipframes int) { - // 'skipframes' is only available starting from 1.13. - runtimeSemrelease112(s, handoff) -} diff --git a/pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go b/pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go deleted file mode 100644 index 3c3673119..000000000 --- a/pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Copyright 2019 The gVisor Authors. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build go1.13 -// +build !go1.15 - -// Check go:linkname function signatures when updating Go version. - -package syncutil - -import _ "unsafe" - -//go:linkname runtimeSemrelease sync.runtime_Semrelease -func runtimeSemrelease(s *uint32, handoff bool, skipframes int) diff --git a/pkg/syncutil/downgradable_rwmutex_unsafe.go b/pkg/syncutil/downgradable_rwmutex_unsafe.go index 07feca402..51e11555d 100644 --- a/pkg/syncutil/downgradable_rwmutex_unsafe.go +++ b/pkg/syncutil/downgradable_rwmutex_unsafe.go @@ -3,7 +3,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build go1.12 +// +build go1.13 // +build !go1.15 // Check go:linkname function signatures when updating Go version. @@ -27,6 +27,9 @@ import ( //go:linkname runtimeSemacquire sync.runtime_Semacquire func runtimeSemacquire(s *uint32) +//go:linkname runtimeSemrelease sync.runtime_Semrelease +func runtimeSemrelease(s *uint32, handoff bool, skipframes int) + // DowngradableRWMutex is identical to sync.RWMutex, but adds the DowngradeLock // method. type DowngradableRWMutex struct { diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go index 1987e89cc..2269f6237 100644 --- a/pkg/syserror/syserror.go +++ b/pkg/syserror/syserror.go @@ -45,6 +45,7 @@ var ( ELIBBAD = error(syscall.ELIBBAD) ELOOP = error(syscall.ELOOP) EMFILE = error(syscall.EMFILE) + EMLINK = error(syscall.EMLINK) EMSGSIZE = error(syscall.EMSGSIZE) ENAMETOOLONG = error(syscall.ENAMETOOLONG) ENOATTR = ENODATA diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD index a3485b35c..f2061c778 100644 --- a/pkg/tcpip/header/BUILD +++ b/pkg/tcpip/header/BUILD @@ -38,12 +38,16 @@ go_test( size = "small", srcs = [ "checksum_test.go", + "ipv6_test.go", "ipversion_test.go", "tcp_test.go", ], deps = [ ":header", + "//pkg/rand", + "//pkg/tcpip", "//pkg/tcpip/buffer", + "@com_github_google_go-cmp//cmp:go_default_library", ], ) @@ -55,5 +59,8 @@ go_test( "ndp_test.go", ], embed = [":header"], - deps = ["//pkg/tcpip"], + deps = [ + "//pkg/tcpip", + "@com_github_google_go-cmp//cmp:go_default_library", + ], ) diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go index 0caa51c1e..135a60b12 100644 --- a/pkg/tcpip/header/ipv6.go +++ b/pkg/tcpip/header/ipv6.go @@ -15,6 +15,7 @@ package header import ( + "crypto/sha256" "encoding/binary" "strings" @@ -90,6 +91,23 @@ const ( // IPv6Any is the non-routable IPv6 "any" meta address. It is also // known as the unspecified address. IPv6Any tcpip.Address = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + + // IIDSize is the size of an interface identifier (IID), in bytes, as + // defined by RFC 4291 section 2.5.1. + IIDSize = 8 + + // IIDOffsetInIPv6Address is the offset, in bytes, from the start + // of an IPv6 address to the beginning of the interface identifier + // (IID) for auto-generated addresses. That is, all bytes before + // the IIDOffsetInIPv6Address-th byte are the prefix bytes, and all + // bytes including and after the IIDOffsetInIPv6Address-th byte are + // for the IID. + IIDOffsetInIPv6Address = 8 + + // OpaqueIIDSecretKeyMinBytes is the recommended minimum number of bytes + // for the secret key used to generate an opaque interface identifier as + // outlined by RFC 7217. + OpaqueIIDSecretKeyMinBytes = 16 ) // IPv6EmptySubnet is the empty IPv6 subnet. It may also be known as the @@ -266,27 +284,43 @@ func SolicitedNodeAddr(addr tcpip.Address) tcpip.Address { return solicitedNodeMulticastPrefix + addr[len(addr)-3:] } +// EthernetAdddressToModifiedEUI64IntoBuf populates buf with a modified EUI-64 +// from a 48-bit Ethernet/MAC address, as per RFC 4291 section 2.5.1. +// +// buf MUST be at least 8 bytes. +func EthernetAdddressToModifiedEUI64IntoBuf(linkAddr tcpip.LinkAddress, buf []byte) { + buf[0] = linkAddr[0] ^ 2 + buf[1] = linkAddr[1] + buf[2] = linkAddr[2] + buf[3] = 0xFF + buf[4] = 0xFE + buf[5] = linkAddr[3] + buf[6] = linkAddr[4] + buf[7] = linkAddr[5] +} + +// EthernetAddressToModifiedEUI64 computes a modified EUI-64 from a 48-bit +// Ethernet/MAC address, as per RFC 4291 section 2.5.1. +func EthernetAddressToModifiedEUI64(linkAddr tcpip.LinkAddress) [IIDSize]byte { + var buf [IIDSize]byte + EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, buf[:]) + return buf +} + // LinkLocalAddr computes the default IPv6 link-local address from a link-layer // (MAC) address. func LinkLocalAddr(linkAddr tcpip.LinkAddress) tcpip.Address { - // Convert a 48-bit MAC to an EUI-64 and then prepend the link-local - // header, FE80::. + // Convert a 48-bit MAC to a modified EUI-64 and then prepend the + // link-local header, FE80::. // // The conversion is very nearly: // aa:bb:cc:dd:ee:ff => FE80::Aabb:ccFF:FEdd:eeff // Note the capital A. The conversion aa->Aa involves a bit flip. - lladdrb := [16]byte{ - 0: 0xFE, - 1: 0x80, - 8: linkAddr[0] ^ 2, - 9: linkAddr[1], - 10: linkAddr[2], - 11: 0xFF, - 12: 0xFE, - 13: linkAddr[3], - 14: linkAddr[4], - 15: linkAddr[5], + lladdrb := [IPv6AddressSize]byte{ + 0: 0xFE, + 1: 0x80, } + EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, lladdrb[IIDOffsetInIPv6Address:]) return tcpip.Address(lladdrb[:]) } @@ -298,3 +332,42 @@ func IsV6LinkLocalAddress(addr tcpip.Address) bool { } return addr[0] == 0xfe && (addr[1]&0xc0) == 0x80 } + +// AppendOpaqueInterfaceIdentifier appends a 64 bit opaque interface identifier +// (IID) to buf as outlined by RFC 7217 and returns the extended buffer. +// +// The opaque IID is generated from the cryptographic hash of the concatenation +// of the prefix, NIC's name, DAD counter (DAD retry counter) and the secret +// key. The secret key SHOULD be at least OpaqueIIDSecretKeyMinBytes bytes and +// MUST be generated to a pseudo-random number. See RFC 4086 for randomness +// requirements for security. +// +// If buf has enough capacity for the IID (IIDSize bytes), a new underlying +// array for the buffer will not be allocated. +func AppendOpaqueInterfaceIdentifier(buf []byte, prefix tcpip.Subnet, nicName string, dadCounter uint8, secretKey []byte) []byte { + // As per RFC 7217 section 5, the opaque identifier can be generated as a + // cryptographic hash of the concatenation of each of the function parameters. + // Note, we omit the optional Network_ID field. + h := sha256.New() + // h.Write never returns an error. + h.Write([]byte(prefix.ID()[:IIDOffsetInIPv6Address])) + h.Write([]byte(nicName)) + h.Write([]byte{dadCounter}) + h.Write(secretKey) + + var sumBuf [sha256.Size]byte + sum := h.Sum(sumBuf[:0]) + + return append(buf, sum[:IIDSize]...) +} + +// LinkLocalAddrWithOpaqueIID computes the default IPv6 link-local address with +// an opaque IID. +func LinkLocalAddrWithOpaqueIID(nicName string, dadCounter uint8, secretKey []byte) tcpip.Address { + lladdrb := [IPv6AddressSize]byte{ + 0: 0xFE, + 1: 0x80, + } + + return tcpip.Address(AppendOpaqueInterfaceIdentifier(lladdrb[:IIDOffsetInIPv6Address], IPv6LinkLocalPrefix.Subnet(), nicName, dadCounter, secretKey)) +} diff --git a/pkg/tcpip/header/ipv6_test.go b/pkg/tcpip/header/ipv6_test.go new file mode 100644 index 000000000..1994003ed --- /dev/null +++ b/pkg/tcpip/header/ipv6_test.go @@ -0,0 +1,208 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package header_test + +import ( + "bytes" + "crypto/sha256" + "testing" + + "github.com/google/go-cmp/cmp" + "gvisor.dev/gvisor/pkg/rand" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" +) + +const linkAddr = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06") + +func TestEthernetAdddressToModifiedEUI64(t *testing.T) { + expectedIID := [header.IIDSize]byte{0, 2, 3, 255, 254, 4, 5, 6} + + if diff := cmp.Diff(expectedIID, header.EthernetAddressToModifiedEUI64(linkAddr)); diff != "" { + t.Errorf("EthernetAddressToModifiedEUI64(%s) mismatch (-want +got):\n%s", linkAddr, diff) + } + + var buf [header.IIDSize]byte + header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, buf[:]) + if diff := cmp.Diff(expectedIID, buf); diff != "" { + t.Errorf("EthernetAddressToModifiedEUI64IntoBuf(%s, _) mismatch (-want +got):\n%s", linkAddr, diff) + } +} + +func TestLinkLocalAddr(t *testing.T) { + if got, want := header.LinkLocalAddr(linkAddr), tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x02\x03\xff\xfe\x04\x05\x06"); got != want { + t.Errorf("got LinkLocalAddr(%s) = %s, want = %s", linkAddr, got, want) + } +} + +func TestAppendOpaqueInterfaceIdentifier(t *testing.T) { + var secretKeyBuf [header.OpaqueIIDSecretKeyMinBytes * 2]byte + if n, err := rand.Read(secretKeyBuf[:]); err != nil { + t.Fatalf("rand.Read(_): %s", err) + } else if want := header.OpaqueIIDSecretKeyMinBytes * 2; n != want { + t.Fatalf("expected rand.Read to read %d bytes, read %d bytes", want, n) + } + + tests := []struct { + name string + prefix tcpip.Subnet + nicName string + dadCounter uint8 + secretKey []byte + }{ + { + name: "SecretKey of minimum size", + prefix: header.IPv6LinkLocalPrefix.Subnet(), + nicName: "eth0", + dadCounter: 0, + secretKey: secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes], + }, + { + name: "SecretKey of less than minimum size", + prefix: func() tcpip.Subnet { + addrWithPrefix := tcpip.AddressWithPrefix{ + Address: "\x01\x02\x03\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + PrefixLen: header.IIDOffsetInIPv6Address * 8, + } + return addrWithPrefix.Subnet() + }(), + nicName: "eth10", + dadCounter: 1, + secretKey: secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes/2], + }, + { + name: "SecretKey of more than minimum size", + prefix: func() tcpip.Subnet { + addrWithPrefix := tcpip.AddressWithPrefix{ + Address: "\x01\x02\x03\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + PrefixLen: header.IIDOffsetInIPv6Address * 8, + } + return addrWithPrefix.Subnet() + }(), + nicName: "eth11", + dadCounter: 2, + secretKey: secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes*2], + }, + { + name: "Nil SecretKey and empty nicName", + prefix: func() tcpip.Subnet { + addrWithPrefix := tcpip.AddressWithPrefix{ + Address: "\x01\x02\x03\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + PrefixLen: header.IIDOffsetInIPv6Address * 8, + } + return addrWithPrefix.Subnet() + }(), + nicName: "", + dadCounter: 3, + secretKey: nil, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + h := sha256.New() + h.Write([]byte(test.prefix.ID()[:header.IIDOffsetInIPv6Address])) + h.Write([]byte(test.nicName)) + h.Write([]byte{test.dadCounter}) + if k := test.secretKey; k != nil { + h.Write(k) + } + var hashSum [sha256.Size]byte + h.Sum(hashSum[:0]) + want := hashSum[:header.IIDSize] + + // Passing a nil buffer should result in a new buffer returned with the + // IID. + if got := header.AppendOpaqueInterfaceIdentifier(nil, test.prefix, test.nicName, test.dadCounter, test.secretKey); !bytes.Equal(got, want) { + t.Errorf("got AppendOpaqueInterfaceIdentifier(nil, %s, %s, %d, %x) = %x, want = %x", test.prefix, test.nicName, test.dadCounter, test.secretKey, got, want) + } + + // Passing a buffer with sufficient capacity for the IID should populate + // the buffer provided. + var iidBuf [header.IIDSize]byte + if got := header.AppendOpaqueInterfaceIdentifier(iidBuf[:0], test.prefix, test.nicName, test.dadCounter, test.secretKey); !bytes.Equal(got, want) { + t.Errorf("got AppendOpaqueInterfaceIdentifier(iidBuf[:0], %s, %s, %d, %x) = %x, want = %x", test.prefix, test.nicName, test.dadCounter, test.secretKey, got, want) + } + if got := iidBuf[:]; !bytes.Equal(got, want) { + t.Errorf("got iidBuf = %x, want = %x", got, want) + } + }) + } +} + +func TestLinkLocalAddrWithOpaqueIID(t *testing.T) { + var secretKeyBuf [header.OpaqueIIDSecretKeyMinBytes * 2]byte + if n, err := rand.Read(secretKeyBuf[:]); err != nil { + t.Fatalf("rand.Read(_): %s", err) + } else if want := header.OpaqueIIDSecretKeyMinBytes * 2; n != want { + t.Fatalf("expected rand.Read to read %d bytes, read %d bytes", want, n) + } + + prefix := header.IPv6LinkLocalPrefix.Subnet() + + tests := []struct { + name string + prefix tcpip.Subnet + nicName string + dadCounter uint8 + secretKey []byte + }{ + { + name: "SecretKey of minimum size", + nicName: "eth0", + dadCounter: 0, + secretKey: secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes], + }, + { + name: "SecretKey of less than minimum size", + nicName: "eth10", + dadCounter: 1, + secretKey: secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes/2], + }, + { + name: "SecretKey of more than minimum size", + nicName: "eth11", + dadCounter: 2, + secretKey: secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes*2], + }, + { + name: "Nil SecretKey and empty nicName", + nicName: "", + dadCounter: 3, + secretKey: nil, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + addrBytes := [header.IPv6AddressSize]byte{ + 0: 0xFE, + 1: 0x80, + } + + want := tcpip.Address(header.AppendOpaqueInterfaceIdentifier( + addrBytes[:header.IIDOffsetInIPv6Address], + prefix, + test.nicName, + test.dadCounter, + test.secretKey, + )) + + if got := header.LinkLocalAddrWithOpaqueIID(test.nicName, test.dadCounter, test.secretKey); got != want { + t.Errorf("got LinkLocalAddrWithOpaqueIID(%s, %d, %x) = %s, want = %s", test.nicName, test.dadCounter, test.secretKey, got, want) + } + }) + } +} diff --git a/pkg/tcpip/header/ndp_options.go b/pkg/tcpip/header/ndp_options.go index 1ca6199ef..06e0bace2 100644 --- a/pkg/tcpip/header/ndp_options.go +++ b/pkg/tcpip/header/ndp_options.go @@ -17,6 +17,7 @@ package header import ( "encoding/binary" "errors" + "math" "time" "gvisor.dev/gvisor/pkg/tcpip" @@ -85,6 +86,23 @@ const ( // within an NDPPrefixInformation. ndpPrefixInformationPrefixOffset = 14 + // NDPRecursiveDNSServerOptionType is the type of the Recursive DNS + // Server option, as per RFC 8106 section 5.1. + NDPRecursiveDNSServerOptionType = 25 + + // ndpRecursiveDNSServerLifetimeOffset is the start of the 4-byte + // Lifetime field within an NDPRecursiveDNSServer. + ndpRecursiveDNSServerLifetimeOffset = 2 + + // ndpRecursiveDNSServerAddressesOffset is the start of the addresses + // for IPv6 Recursive DNS Servers within an NDPRecursiveDNSServer. + ndpRecursiveDNSServerAddressesOffset = 6 + + // minNDPRecursiveDNSServerLength is the minimum NDP Recursive DNS + // Server option's length field value when it contains at least one + // IPv6 address. + minNDPRecursiveDNSServerLength = 3 + // lengthByteUnits is the multiplier factor for the Length field of an // NDP option. That is, the length field for NDP options is in units of // 8 octets, as per RFC 4861 section 4.6. @@ -92,13 +110,13 @@ const ( ) var ( - // NDPPrefixInformationInfiniteLifetime is a value that represents - // infinity for the Valid and Preferred Lifetime fields in a NDP Prefix - // Information option. Its value is (2^32 - 1)s = 4294967295s + // NDPInfiniteLifetime is a value that represents infinity for the + // 4-byte lifetime fields found in various NDP options. Its value is + // (2^32 - 1)s = 4294967295s. // // This is a variable instead of a constant so that tests can change // this value to a smaller value. It should only be modified by tests. - NDPPrefixInformationInfiniteLifetime = time.Second * 4294967295 + NDPInfiniteLifetime = time.Second * math.MaxUint32 ) // NDPOptionIterator is an iterator of NDPOption. @@ -118,6 +136,7 @@ var ( ErrNDPOptBufExhausted = errors.New("Buffer unexpectedly exhausted") ErrNDPOptZeroLength = errors.New("NDP option has zero-valued Length field") ErrNDPOptMalformedBody = errors.New("NDP option has a malformed body") + ErrNDPInvalidLength = errors.New("NDP option's Length value is invalid as per relevant RFC") ) // Next returns the next element in the backing NDPOptions, or true if we are @@ -182,6 +201,22 @@ func (i *NDPOptionIterator) Next() (NDPOption, bool, error) { } return NDPPrefixInformation(body), false, nil + + case NDPRecursiveDNSServerOptionType: + // RFC 8106 section 5.3.1 outlines that the RDNSS option + // must have a minimum length of 3 so it contains at + // least one IPv6 address. + if l < minNDPRecursiveDNSServerLength { + return nil, true, ErrNDPInvalidLength + } + + opt := NDPRecursiveDNSServer(body) + if len(opt.Addresses()) == 0 { + return nil, true, ErrNDPOptMalformedBody + } + + return opt, false, nil + default: // We do not yet recognize the option, just skip for // now. This is okay because RFC 4861 allows us to @@ -434,7 +469,7 @@ func (o NDPPrefixInformation) AutonomousAddressConfigurationFlag() bool { // // Note, a value of 0 implies the prefix should not be considered as on-link, // and a value of infinity/forever is represented by -// NDPPrefixInformationInfiniteLifetime. +// NDPInfiniteLifetime. func (o NDPPrefixInformation) ValidLifetime() time.Duration { // The field is the time in seconds, as per RFC 4861 section 4.6.2. return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpPrefixInformationValidLifetimeOffset:])) @@ -447,7 +482,7 @@ func (o NDPPrefixInformation) ValidLifetime() time.Duration { // // Note, a value of 0 implies that addresses generated from the prefix should // no longer remain preferred, and a value of infinity is represented by -// NDPPrefixInformationInfiniteLifetime. +// NDPInfiniteLifetime. // // Also note that the value of this field MUST NOT exceed the Valid Lifetime // field to avoid preferring addresses that are no longer valid, for the @@ -476,3 +511,79 @@ func (o NDPPrefixInformation) Subnet() tcpip.Subnet { } return addrWithPrefix.Subnet() } + +// NDPRecursiveDNSServer is the NDP Recursive DNS Server option, as defined by +// RFC 8106 section 5.1. +// +// To make sure that the option meets its minimum length and does not end in the +// middle of a DNS server's IPv6 address, the length of a valid +// NDPRecursiveDNSServer must meet the following constraint: +// (Length - ndpRecursiveDNSServerAddressesOffset) % IPv6AddressSize == 0 +type NDPRecursiveDNSServer []byte + +// Type returns the type of an NDP Recursive DNS Server option. +// +// Type implements NDPOption.Type. +func (NDPRecursiveDNSServer) Type() uint8 { + return NDPRecursiveDNSServerOptionType +} + +// Length implements NDPOption.Length. +func (o NDPRecursiveDNSServer) Length() int { + return len(o) +} + +// serializeInto implements NDPOption.serializeInto. +func (o NDPRecursiveDNSServer) serializeInto(b []byte) int { + used := copy(b, o) + + // Zero out the reserved bytes that are before the Lifetime field. + for i := 0; i < ndpRecursiveDNSServerLifetimeOffset; i++ { + b[i] = 0 + } + + return used +} + +// Lifetime returns the length of time that the DNS server addresses +// in this option may be used for name resolution. +// +// Note, a value of 0 implies the addresses should no longer be used, +// and a value of infinity/forever is represented by NDPInfiniteLifetime. +// +// Lifetime may panic if o does not have enough bytes to hold the Lifetime +// field. +func (o NDPRecursiveDNSServer) Lifetime() time.Duration { + // The field is the time in seconds, as per RFC 8106 section 5.1. + return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpRecursiveDNSServerLifetimeOffset:])) +} + +// Addresses returns the recursive DNS server IPv6 addresses that may be +// used for name resolution. +// +// Note, some of the addresses returned MAY be link-local addresses. +// +// Addresses may panic if o does not hold valid IPv6 addresses. +func (o NDPRecursiveDNSServer) Addresses() []tcpip.Address { + l := len(o) + if l < ndpRecursiveDNSServerAddressesOffset { + return nil + } + + l -= ndpRecursiveDNSServerAddressesOffset + if l%IPv6AddressSize != 0 { + return nil + } + + buf := o[ndpRecursiveDNSServerAddressesOffset:] + var addrs []tcpip.Address + for len(buf) > 0 { + addr := tcpip.Address(buf[:IPv6AddressSize]) + if !IsV6UnicastAddress(addr) { + return nil + } + addrs = append(addrs, addr) + buf = buf[IPv6AddressSize:] + } + return addrs +} diff --git a/pkg/tcpip/header/ndp_test.go b/pkg/tcpip/header/ndp_test.go index ad6daafcd..2c439d70c 100644 --- a/pkg/tcpip/header/ndp_test.go +++ b/pkg/tcpip/header/ndp_test.go @@ -19,6 +19,7 @@ import ( "testing" "time" + "github.com/google/go-cmp/cmp" "gvisor.dev/gvisor/pkg/tcpip" ) @@ -369,6 +370,175 @@ func TestNDPPrefixInformationOption(t *testing.T) { } } +func TestNDPRecursiveDNSServerOptionSerialize(t *testing.T) { + b := []byte{ + 9, 8, + 1, 2, 4, 8, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + } + targetBuf := []byte{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} + expected := []byte{ + 25, 3, 0, 0, + 1, 2, 4, 8, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + } + opts := NDPOptions(targetBuf) + serializer := NDPOptionsSerializer{ + NDPRecursiveDNSServer(b), + } + if got, want := opts.Serialize(serializer), len(expected); got != want { + t.Errorf("got Serialize = %d, want = %d", got, want) + } + if !bytes.Equal(targetBuf, expected) { + t.Fatalf("got targetBuf = %x, want = %x", targetBuf, expected) + } + + it, err := opts.Iter(true) + if err != nil { + t.Fatalf("got Iter = (_, %s), want = (_, nil)", err) + } + + next, done, err := it.Next() + if err != nil { + t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err) + } + if done { + t.Fatal("got Next = (_, true, _), want = (_, false, _)") + } + if got := next.Type(); got != NDPRecursiveDNSServerOptionType { + t.Errorf("got Type = %d, want = %d", got, NDPRecursiveDNSServerOptionType) + } + + opt, ok := next.(NDPRecursiveDNSServer) + if !ok { + t.Fatalf("next (type = %T) cannot be casted to an NDPRecursiveDNSServer", next) + } + if got := opt.Type(); got != 25 { + t.Errorf("got Type = %d, want = 31", got) + } + if got := opt.Length(); got != 22 { + t.Errorf("got Length = %d, want = 22", got) + } + if got, want := opt.Lifetime(), 16909320*time.Second; got != want { + t.Errorf("got Lifetime = %s, want = %s", got, want) + } + want := []tcpip.Address{ + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + } + if got := opt.Addresses(); !cmp.Equal(got, want) { + t.Errorf("got Addresses = %v, want = %v", got, want) + } + + // Iterator should not return anything else. + next, done, err = it.Next() + if err != nil { + t.Errorf("got Next = (_, _, %s), want = (_, _, nil)", err) + } + if !done { + t.Error("got Next = (_, false, _), want = (_, true, _)") + } + if next != nil { + t.Errorf("got Next = (%x, _, _), want = (nil, _, _)", next) + } +} + +func TestNDPRecursiveDNSServerOption(t *testing.T) { + tests := []struct { + name string + buf []byte + lifetime time.Duration + addrs []tcpip.Address + }{ + { + "Valid1Addr", + []byte{ + 25, 3, 0, 0, + 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + }, + 0, + []tcpip.Address{ + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + }, + }, + { + "Valid2Addr", + []byte{ + 25, 5, 0, 0, + 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 17, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, + }, + 0, + []tcpip.Address{ + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + "\x11\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x10", + }, + }, + { + "Valid3Addr", + []byte{ + 25, 7, 0, 0, + 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 17, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, + 17, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 17, + }, + 0, + []tcpip.Address{ + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + "\x11\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x10", + "\x11\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x11", + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + opts := NDPOptions(test.buf) + it, err := opts.Iter(true) + if err != nil { + t.Fatalf("got Iter = (_, %s), want = (_, nil)", err) + } + + // Iterator should get our option. + next, done, err := it.Next() + if err != nil { + t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err) + } + if done { + t.Fatal("got Next = (_, true, _), want = (_, false, _)") + } + if got := next.Type(); got != NDPRecursiveDNSServerOptionType { + t.Fatalf("got Type %= %d, want = %d", got, NDPRecursiveDNSServerOptionType) + } + + opt, ok := next.(NDPRecursiveDNSServer) + if !ok { + t.Fatalf("next (type = %T) cannot be casted to an NDPRecursiveDNSServer", next) + } + if got := opt.Lifetime(); got != test.lifetime { + t.Errorf("got Lifetime = %d, want = %d", got, test.lifetime) + } + if got := opt.Addresses(); !cmp.Equal(got, test.addrs) { + t.Errorf("got Addresses = %v, want = %v", got, test.addrs) + } + + // Iterator should not return anything else. + next, done, err = it.Next() + if err != nil { + t.Errorf("got Next = (_, _, %s), want = (_, _, nil)", err) + } + if !done { + t.Error("got Next = (_, false, _), want = (_, true, _)") + } + if next != nil { + t.Errorf("got Next = (%x, _, _), want = (nil, _, _)", next) + } + }) + } +} + // TestNDPOptionsIterCheck tests that Iter will return false if the NDPOptions // the iterator was returned for is malformed. func TestNDPOptionsIterCheck(t *testing.T) { @@ -473,6 +643,51 @@ func TestNDPOptionsIterCheck(t *testing.T) { }, nil, }, + { + "InvalidRecursiveDNSServerCutsOffAddress", + []byte{ + 25, 4, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 0, 1, 2, 3, 4, 5, 6, 7, + }, + ErrNDPOptMalformedBody, + }, + { + "InvalidRecursiveDNSServerInvalidLengthField", + []byte{ + 25, 2, 0, 0, + 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, + }, + ErrNDPInvalidLength, + }, + { + "RecursiveDNSServerTooSmall", + []byte{ + 25, 1, 0, 0, + 0, 0, 0, + }, + ErrNDPOptBufExhausted, + }, + { + "RecursiveDNSServerMulticast", + []byte{ + 25, 3, 0, 0, + 0, 0, 0, 0, + 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + }, + ErrNDPOptMalformedBody, + }, + { + "RecursiveDNSServerUnspecified", + []byte{ + 25, 3, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + ErrNDPOptMalformedBody, + }, } for _, test := range tests { diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go index da8482509..42cacb8a6 100644 --- a/pkg/tcpip/network/arp/arp.go +++ b/pkg/tcpip/network/arp/arp.go @@ -79,16 +79,16 @@ func (e *endpoint) MaxHeaderLength() uint16 { func (e *endpoint) Close() {} -func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderParams, stack.PacketLooping, tcpip.PacketBuffer) *tcpip.Error { +func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderParams, tcpip.PacketBuffer) *tcpip.Error { return tcpip.ErrNotSupported } // WritePackets implements stack.NetworkEndpoint.WritePackets. -func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []tcpip.PacketBuffer, stack.NetworkHeaderParams, stack.PacketLooping) (int, *tcpip.Error) { +func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []tcpip.PacketBuffer, stack.NetworkHeaderParams) (int, *tcpip.Error) { return 0, tcpip.ErrNotSupported } -func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error { +func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error { return tcpip.ErrNotSupported } diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go index 4144a7837..f1bc33adf 100644 --- a/pkg/tcpip/network/ip_test.go +++ b/pkg/tcpip/network/ip_test.go @@ -239,7 +239,7 @@ func TestIPv4Send(t *testing.T) { if err != nil { t.Fatalf("could not find route: %v", err) } - if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketOut, tcpip.PacketBuffer{ + if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{ Header: hdr, Data: payload.ToVectorisedView(), }); err != nil { @@ -480,7 +480,7 @@ func TestIPv6Send(t *testing.T) { if err != nil { t.Fatalf("could not find route: %v", err) } - if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketOut, tcpip.PacketBuffer{ + if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{ Header: hdr, Data: payload.ToVectorisedView(), }); err != nil { diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index e645cf62c..4ee3d5b45 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -238,11 +238,11 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS } // WritePacket writes a packet to the given destination address and protocol. -func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error { +func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error { ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params) pkt.NetworkHeader = buffer.View(ip) - if loop&stack.PacketLoop != 0 { + if r.Loop&stack.PacketLoop != 0 { // The inbound path expects the network header to still be in // the PacketBuffer's Data field. views := make([]buffer.View, 1, 1+len(pkt.Data.Views())) @@ -256,7 +256,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw loopedR.Release() } - if loop&stack.PacketOut == 0 { + if r.Loop&stack.PacketOut == 0 { return nil } if pkt.Header.UsedLength()+pkt.Data.Size() > int(e.linkEP.MTU()) && (gso == nil || gso.Type == stack.GSONone) { @@ -270,11 +270,11 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw } // WritePackets implements stack.NetworkEndpoint.WritePackets. -func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) { - if loop&stack.PacketLoop != 0 { +func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) { + if r.Loop&stack.PacketLoop != 0 { panic("multiple packets in local loop") } - if loop&stack.PacketOut == 0 { + if r.Loop&stack.PacketOut == 0 { return len(pkts), nil } @@ -289,7 +289,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac // WriteHeaderIncludedPacket writes a packet already containing a network // header through the given route. -func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error { +func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error { // The packet already has an IP header, but there are a few required // checks. ip := header.IPv4(pkt.Data.First()) @@ -324,10 +324,10 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, loop stack.PacketLo ip.SetChecksum(0) ip.SetChecksum(^ip.CalculateChecksum()) - if loop&stack.PacketLoop != 0 { + if r.Loop&stack.PacketLoop != 0 { e.HandlePacket(r, pkt.Clone()) } - if loop&stack.PacketOut == 0 { + if r.Loop&stack.PacketOut == 0 { return nil } diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go index dd31f0fb7..58c3c79b9 100644 --- a/pkg/tcpip/network/ipv6/ipv6.go +++ b/pkg/tcpip/network/ipv6/ipv6.go @@ -112,11 +112,11 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS } // WritePacket writes a packet to the given destination address and protocol. -func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error { +func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error { ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params) pkt.NetworkHeader = buffer.View(ip) - if loop&stack.PacketLoop != 0 { + if r.Loop&stack.PacketLoop != 0 { // The inbound path expects the network header to still be in // the PacketBuffer's Data field. views := make([]buffer.View, 1, 1+len(pkt.Data.Views())) @@ -130,7 +130,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw loopedR.Release() } - if loop&stack.PacketOut == 0 { + if r.Loop&stack.PacketOut == 0 { return nil } @@ -139,11 +139,11 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw } // WritePackets implements stack.LinkEndpoint.WritePackets. -func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) { - if loop&stack.PacketLoop != 0 { +func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) { + if r.Loop&stack.PacketLoop != 0 { panic("not implemented") } - if loop&stack.PacketOut == 0 { + if r.Loop&stack.PacketOut == 0 { return len(pkts), nil } @@ -161,8 +161,8 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac // WriteHeaderIncludedPacker implements stack.NetworkEndpoint. It is not yet // supported by IPv6. -func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error { - // TODO(b/119580726): Support IPv6 header-included packets. +func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error { + // TODO(b/146666412): Support IPv6 header-included packets. return tcpip.ErrNotSupported } diff --git a/pkg/tcpip/ports/BUILD b/pkg/tcpip/ports/BUILD index 4839f0a65..e156b01f6 100644 --- a/pkg/tcpip/ports/BUILD +++ b/pkg/tcpip/ports/BUILD @@ -1,5 +1,5 @@ -load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) diff --git a/pkg/tcpip/ports/ports.go b/pkg/tcpip/ports/ports.go index 30cea8996..6c5e19e8f 100644 --- a/pkg/tcpip/ports/ports.go +++ b/pkg/tcpip/ports/ports.go @@ -41,6 +41,30 @@ type portDescriptor struct { port uint16 } +// Flags represents the type of port reservation. +// +// +stateify savable +type Flags struct { + // MostRecent represents UDP SO_REUSEADDR. + MostRecent bool + + // LoadBalanced indicates SO_REUSEPORT. + // + // LoadBalanced takes precidence over MostRecent. + LoadBalanced bool +} + +func (f Flags) bits() reuseFlag { + var rf reuseFlag + if f.MostRecent { + rf |= mostRecentFlag + } + if f.LoadBalanced { + rf |= loadBalancedFlag + } + return rf +} + // PortManager manages allocating, reserving and releasing ports. type PortManager struct { mu sync.RWMutex @@ -54,9 +78,59 @@ type PortManager struct { hint uint32 } +type reuseFlag int + +const ( + mostRecentFlag reuseFlag = 1 << iota + loadBalancedFlag + nextFlag + + flagMask = nextFlag - 1 +) + type portNode struct { - reuse bool - refs int + // refs stores the count for each possible flag combination. + refs [nextFlag]int +} + +func (p portNode) totalRefs() int { + var total int + for _, r := range p.refs { + total += r + } + return total +} + +// flagRefs returns the number of references with all specified flags. +func (p portNode) flagRefs(flags reuseFlag) int { + var total int + for i, r := range p.refs { + if reuseFlag(i)&flags == flags { + total += r + } + } + return total +} + +// allRefsHave returns if all references have all specified flags. +func (p portNode) allRefsHave(flags reuseFlag) bool { + for i, r := range p.refs { + if reuseFlag(i)&flags == flags && r > 0 { + return false + } + } + return true +} + +// intersectionRefs returns the set of flags shared by all references. +func (p portNode) intersectionRefs() reuseFlag { + intersection := flagMask + for i, r := range p.refs { + if r > 0 { + intersection &= reuseFlag(i) + } + } + return intersection } // deviceNode is never empty. When it has no elements, it is removed from the @@ -66,30 +140,44 @@ type deviceNode map[tcpip.NICID]portNode // isAvailable checks whether binding is possible by device. If not binding to a // device, check against all portNodes. If binding to a specific device, check // against the unspecified device and the provided device. -func (d deviceNode) isAvailable(reuse bool, bindToDevice tcpip.NICID) bool { +// +// If either of the port reuse flags is enabled on any of the nodes, all nodes +// sharing a port must share at least one reuse flag. This matches Linux's +// behavior. +func (d deviceNode) isAvailable(flags Flags, bindToDevice tcpip.NICID) bool { + flagBits := flags.bits() if bindToDevice == 0 { // Trying to binding all devices. - if !reuse { + if flagBits == 0 { // Can't bind because the (addr,port) is already bound. return false } + intersection := flagMask for _, p := range d { - if !p.reuse { - // Can't bind because the (addr,port) was previously bound without reuse. + i := p.intersectionRefs() + intersection &= i + if intersection&flagBits == 0 { + // Can't bind because the (addr,port) was + // previously bound without reuse. return false } } return true } + intersection := flagMask + if p, ok := d[0]; ok { - if !reuse || !p.reuse { + intersection = p.intersectionRefs() + if intersection&flagBits == 0 { return false } } if p, ok := d[bindToDevice]; ok { - if !reuse || !p.reuse { + i := p.intersectionRefs() + intersection &= i + if intersection&flagBits == 0 { return false } } @@ -103,12 +191,12 @@ type bindAddresses map[tcpip.Address]deviceNode // isAvailable checks whether an IP address is available to bind to. If the // address is the "any" address, check all other addresses. Otherwise, just // check against the "any" address and the provided address. -func (b bindAddresses) isAvailable(addr tcpip.Address, reuse bool, bindToDevice tcpip.NICID) bool { +func (b bindAddresses) isAvailable(addr tcpip.Address, flags Flags, bindToDevice tcpip.NICID) bool { if addr == anyIPAddress { // If binding to the "any" address then check that there are no conflicts // with all addresses. for _, d := range b { - if !d.isAvailable(reuse, bindToDevice) { + if !d.isAvailable(flags, bindToDevice) { return false } } @@ -117,14 +205,14 @@ func (b bindAddresses) isAvailable(addr tcpip.Address, reuse bool, bindToDevice // Check that there is no conflict with the "any" address. if d, ok := b[anyIPAddress]; ok { - if !d.isAvailable(reuse, bindToDevice) { + if !d.isAvailable(flags, bindToDevice) { return false } } // Check that this is no conflict with the provided address. if d, ok := b[addr]; ok { - if !d.isAvailable(reuse, bindToDevice) { + if !d.isAvailable(flags, bindToDevice) { return false } } @@ -190,17 +278,17 @@ func (s *PortManager) pickEphemeralPort(offset, count uint32, testPort func(p ui } // IsPortAvailable tests if the given port is available on all given protocols. -func (s *PortManager) IsPortAvailable(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) bool { +func (s *PortManager) IsPortAvailable(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) bool { s.mu.Lock() defer s.mu.Unlock() - return s.isPortAvailableLocked(networks, transport, addr, port, reuse, bindToDevice) + return s.isPortAvailableLocked(networks, transport, addr, port, flags, bindToDevice) } -func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) bool { +func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) bool { for _, network := range networks { desc := portDescriptor{network, transport, port} if addrs, ok := s.allocatedPorts[desc]; ok { - if !addrs.isAvailable(addr, reuse, bindToDevice) { + if !addrs.isAvailable(addr, flags, bindToDevice) { return false } } @@ -212,14 +300,14 @@ func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumb // reserved by another endpoint. If port is zero, ReservePort will search for // an unreserved ephemeral port and reserve it, returning its value in the // "port" return value. -func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) (reservedPort uint16, err *tcpip.Error) { +func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) (reservedPort uint16, err *tcpip.Error) { s.mu.Lock() defer s.mu.Unlock() // If a port is specified, just try to reserve it for all network // protocols. if port != 0 { - if !s.reserveSpecificPort(networks, transport, addr, port, reuse, bindToDevice) { + if !s.reserveSpecificPort(networks, transport, addr, port, flags, bindToDevice) { return 0, tcpip.ErrPortInUse } return port, nil @@ -227,15 +315,16 @@ func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transp // A port wasn't specified, so try to find one. return s.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) { - return s.reserveSpecificPort(networks, transport, addr, p, reuse, bindToDevice), nil + return s.reserveSpecificPort(networks, transport, addr, p, flags, bindToDevice), nil }) } // reserveSpecificPort tries to reserve the given port on all given protocols. -func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) bool { - if !s.isPortAvailableLocked(networks, transport, addr, port, reuse, bindToDevice) { +func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) bool { + if !s.isPortAvailableLocked(networks, transport, addr, port, flags, bindToDevice) { return false } + flagBits := flags.bits() // Reserve port on all network protocols. for _, network := range networks { @@ -250,12 +339,9 @@ func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber d = make(deviceNode) m[addr] = d } - if n, ok := d[bindToDevice]; ok { - n.refs++ - d[bindToDevice] = n - } else { - d[bindToDevice] = portNode{reuse: reuse, refs: 1} - } + n := d[bindToDevice] + n.refs[flagBits]++ + d[bindToDevice] = n } return true @@ -263,10 +349,12 @@ func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber // ReleasePort releases the reservation on a port/IP combination so that it can // be reserved by other endpoints. -func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, bindToDevice tcpip.NICID) { +func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) { s.mu.Lock() defer s.mu.Unlock() + flagBits := flags.bits() + for _, network := range networks { desc := portDescriptor{network, transport, port} if m, ok := s.allocatedPorts[desc]; ok { @@ -278,9 +366,9 @@ func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transp if !ok { continue } - n.refs-- + n.refs[flagBits]-- d[bindToDevice] = n - if n.refs == 0 { + if n.refs == [nextFlag]int{} { delete(d, bindToDevice) } if len(d) == 0 { diff --git a/pkg/tcpip/ports/ports_test.go b/pkg/tcpip/ports/ports_test.go index 19f4833fc..d6969d050 100644 --- a/pkg/tcpip/ports/ports_test.go +++ b/pkg/tcpip/ports/ports_test.go @@ -33,7 +33,7 @@ type portReserveTestAction struct { port uint16 ip tcpip.Address want *tcpip.Error - reuse bool + flags Flags release bool device tcpip.NICID } @@ -50,7 +50,7 @@ func TestPortReservation(t *testing.T) { {port: 80, ip: fakeIPAddress1, want: nil}, /* N.B. Order of tests matters! */ {port: 80, ip: anyIPAddress, want: tcpip.ErrPortInUse}, - {port: 80, ip: fakeIPAddress, want: tcpip.ErrPortInUse, reuse: true}, + {port: 80, ip: fakeIPAddress, want: tcpip.ErrPortInUse, flags: Flags{LoadBalanced: true}}, }, }, { @@ -61,7 +61,7 @@ func TestPortReservation(t *testing.T) { /* release fakeIPAddress, but anyIPAddress is still inuse */ {port: 22, ip: fakeIPAddress, release: true}, {port: 22, ip: fakeIPAddress, want: tcpip.ErrPortInUse}, - {port: 22, ip: fakeIPAddress, want: tcpip.ErrPortInUse, reuse: true}, + {port: 22, ip: fakeIPAddress, want: tcpip.ErrPortInUse, flags: Flags{LoadBalanced: true}}, /* Release port 22 from any IP address, then try to reserve fake IP address on 22 */ {port: 22, ip: anyIPAddress, want: nil, release: true}, {port: 22, ip: fakeIPAddress, want: nil}, @@ -71,36 +71,36 @@ func TestPortReservation(t *testing.T) { actions: []portReserveTestAction{ {port: 00, ip: fakeIPAddress, want: nil}, {port: 00, ip: fakeIPAddress, want: nil}, - {port: 00, ip: fakeIPAddress, reuse: true, want: nil}, + {port: 00, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil}, }, }, { tname: "bind to ip with reuseport", actions: []portReserveTestAction{ - {port: 25, ip: fakeIPAddress, reuse: true, want: nil}, - {port: 25, ip: fakeIPAddress, reuse: true, want: nil}, + {port: 25, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil}, + {port: 25, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil}, - {port: 25, ip: fakeIPAddress, reuse: false, want: tcpip.ErrPortInUse}, - {port: 25, ip: anyIPAddress, reuse: false, want: tcpip.ErrPortInUse}, + {port: 25, ip: fakeIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse}, + {port: 25, ip: anyIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse}, - {port: 25, ip: anyIPAddress, reuse: true, want: nil}, + {port: 25, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, want: nil}, }, }, { tname: "bind to inaddr any with reuseport", actions: []portReserveTestAction{ - {port: 24, ip: anyIPAddress, reuse: true, want: nil}, - {port: 24, ip: anyIPAddress, reuse: true, want: nil}, + {port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, want: nil}, + {port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, want: nil}, - {port: 24, ip: anyIPAddress, reuse: false, want: tcpip.ErrPortInUse}, - {port: 24, ip: fakeIPAddress, reuse: false, want: tcpip.ErrPortInUse}, + {port: 24, ip: anyIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse}, + {port: 24, ip: fakeIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse}, - {port: 24, ip: fakeIPAddress, reuse: true, want: nil}, - {port: 24, ip: fakeIPAddress, release: true, want: nil}, + {port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil}, + {port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, release: true, want: nil}, - {port: 24, ip: anyIPAddress, release: true}, - {port: 24, ip: anyIPAddress, reuse: false, want: tcpip.ErrPortInUse}, + {port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, release: true}, + {port: 24, ip: anyIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse}, - {port: 24, ip: anyIPAddress, release: true}, - {port: 24, ip: anyIPAddress, reuse: false, want: nil}, + {port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, release: true}, + {port: 24, ip: anyIPAddress, flags: Flags{}, want: nil}, }, }, { tname: "bind twice with device fails", @@ -125,88 +125,152 @@ func TestPortReservation(t *testing.T) { actions: []portReserveTestAction{ {port: 24, ip: fakeIPAddress, want: nil}, {port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse}, - {port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: tcpip.ErrPortInUse}, + {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse}, {port: 24, ip: fakeIPAddress, want: tcpip.ErrPortInUse}, - {port: 24, ip: fakeIPAddress, reuse: true, want: tcpip.ErrPortInUse}, + {port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse}, }, }, { tname: "bind with device", actions: []portReserveTestAction{ {port: 24, ip: fakeIPAddress, device: 123, want: nil}, {port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse}, - {port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: tcpip.ErrPortInUse}, + {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse}, {port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse}, - {port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: tcpip.ErrPortInUse}, - {port: 24, ip: fakeIPAddress, device: 456, reuse: true, want: nil}, + {port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse}, + {port: 24, ip: fakeIPAddress, device: 456, flags: Flags{LoadBalanced: true}, want: nil}, {port: 24, ip: fakeIPAddress, device: 789, want: nil}, {port: 24, ip: fakeIPAddress, want: tcpip.ErrPortInUse}, - {port: 24, ip: fakeIPAddress, reuse: true, want: tcpip.ErrPortInUse}, + {port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse}, }, }, { - tname: "bind with reuse", + tname: "bind with reuseport", actions: []portReserveTestAction{ - {port: 24, ip: fakeIPAddress, reuse: true, want: nil}, + {port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil}, {port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse}, - {port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil}, + {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil}, {port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse}, - {port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil}, + {port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil}, }, }, { - tname: "binding with reuse and device", + tname: "binding with reuseport and device", actions: []portReserveTestAction{ - {port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil}, + {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil}, {port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse}, - {port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil}, + {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil}, {port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse}, - {port: 24, ip: fakeIPAddress, device: 456, reuse: true, want: nil}, - {port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil}, - {port: 24, ip: fakeIPAddress, device: 789, reuse: true, want: nil}, + {port: 24, ip: fakeIPAddress, device: 456, flags: Flags{LoadBalanced: true}, want: nil}, + {port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil}, + {port: 24, ip: fakeIPAddress, device: 789, flags: Flags{LoadBalanced: true}, want: nil}, {port: 24, ip: fakeIPAddress, device: 999, want: tcpip.ErrPortInUse}, }, }, { - tname: "mixing reuse and not reuse by binding to device", + tname: "mixing reuseport and not reuseport by binding to device", actions: []portReserveTestAction{ - {port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil}, + {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil}, {port: 24, ip: fakeIPAddress, device: 456, want: nil}, - {port: 24, ip: fakeIPAddress, device: 789, reuse: true, want: nil}, + {port: 24, ip: fakeIPAddress, device: 789, flags: Flags{LoadBalanced: true}, want: nil}, {port: 24, ip: fakeIPAddress, device: 999, want: nil}, }, }, { - tname: "can't bind to 0 after mixing reuse and not reuse", + tname: "can't bind to 0 after mixing reuseport and not reuseport", actions: []portReserveTestAction{ - {port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil}, + {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil}, {port: 24, ip: fakeIPAddress, device: 456, want: nil}, - {port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: tcpip.ErrPortInUse}, + {port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse}, }, }, { tname: "bind and release", actions: []portReserveTestAction{ - {port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil}, - {port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil}, - {port: 24, ip: fakeIPAddress, device: 345, reuse: false, want: tcpip.ErrPortInUse}, - {port: 24, ip: fakeIPAddress, device: 789, reuse: true, want: nil}, + {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil}, + {port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil}, + {port: 24, ip: fakeIPAddress, device: 345, flags: Flags{}, want: tcpip.ErrPortInUse}, + {port: 24, ip: fakeIPAddress, device: 789, flags: Flags{LoadBalanced: true}, want: nil}, // Release the bind to device 0 and try again. - {port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil, release: true}, - {port: 24, ip: fakeIPAddress, device: 345, reuse: false, want: nil}, + {port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil, release: true}, + {port: 24, ip: fakeIPAddress, device: 345, flags: Flags{}, want: nil}, }, }, { - tname: "bind twice with reuse once", + tname: "bind twice with reuseport once", actions: []portReserveTestAction{ - {port: 24, ip: fakeIPAddress, device: 123, reuse: false, want: nil}, - {port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: tcpip.ErrPortInUse}, + {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil}, + {port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse}, }, }, { tname: "release an unreserved device", actions: []portReserveTestAction{ - {port: 24, ip: fakeIPAddress, device: 123, reuse: false, want: nil}, - {port: 24, ip: fakeIPAddress, device: 456, reuse: false, want: nil}, + {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil}, + {port: 24, ip: fakeIPAddress, device: 456, flags: Flags{}, want: nil}, // The below don't exist. - {port: 24, ip: fakeIPAddress, device: 345, reuse: false, want: nil, release: true}, - {port: 9999, ip: fakeIPAddress, device: 123, reuse: false, want: nil, release: true}, + {port: 24, ip: fakeIPAddress, device: 345, flags: Flags{}, want: nil, release: true}, + {port: 9999, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil, release: true}, // Release all. - {port: 24, ip: fakeIPAddress, device: 123, reuse: false, want: nil, release: true}, - {port: 24, ip: fakeIPAddress, device: 456, reuse: false, want: nil, release: true}, + {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil, release: true}, + {port: 24, ip: fakeIPAddress, device: 456, flags: Flags{}, want: nil, release: true}, + }, + }, { + tname: "bind with reuseaddr", + actions: []portReserveTestAction{ + {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil}, + {port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse}, + {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{MostRecent: true}, want: nil}, + {port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse}, + {port: 24, ip: fakeIPAddress, device: 0, flags: Flags{MostRecent: true}, want: nil}, + }, + }, { + tname: "bind twice with reuseaddr once", + actions: []portReserveTestAction{ + {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil}, + {port: 24, ip: fakeIPAddress, device: 0, flags: Flags{MostRecent: true}, want: tcpip.ErrPortInUse}, + }, + }, { + tname: "bind with reuseaddr and reuseport", + actions: []portReserveTestAction{ + {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil}, + {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil}, + {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil}, + }, + }, { + tname: "bind with reuseaddr and reuseport, and then reuseaddr", + actions: []portReserveTestAction{ + {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil}, + {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil}, + {port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse}, + }, + }, { + tname: "bind with reuseaddr and reuseport, and then reuseport", + actions: []portReserveTestAction{ + {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil}, + {port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil}, + {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: tcpip.ErrPortInUse}, + }, + }, { + tname: "bind with reuseaddr and reuseport twice, and then reuseaddr", + actions: []portReserveTestAction{ + {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil}, + {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil}, + {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil}, + }, + }, { + tname: "bind with reuseaddr and reuseport twice, and then reuseport", + actions: []portReserveTestAction{ + {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil}, + {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil}, + {port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil}, + }, + }, { + tname: "bind with reuseaddr, and then reuseaddr and reuseport", + actions: []portReserveTestAction{ + {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil}, + {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil}, + {port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse}, + }, + }, { + tname: "bind with reuseport, and then reuseaddr and reuseport", + actions: []portReserveTestAction{ + {port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil}, + {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil}, + {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: tcpip.ErrPortInUse}, }, }, } { @@ -216,12 +280,12 @@ func TestPortReservation(t *testing.T) { for _, test := range test.actions { if test.release { - pm.ReleasePort(net, fakeTransNumber, test.ip, test.port, test.device) + pm.ReleasePort(net, fakeTransNumber, test.ip, test.port, test.flags, test.device) continue } - gotPort, err := pm.ReservePort(net, fakeTransNumber, test.ip, test.port, test.reuse, test.device) + gotPort, err := pm.ReservePort(net, fakeTransNumber, test.ip, test.port, test.flags, test.device) if err != test.want { - t.Fatalf("ReservePort(.., .., %s, %d, %t, %d) = %v, want %v", test.ip, test.port, test.reuse, test.device, err, test.want) + t.Fatalf("ReservePort(.., .., %s, %d, %+v, %d) = %v, want %v", test.ip, test.port, test.flags, test.device, err, test.want) } if test.port == 0 && (gotPort == 0 || gotPort < FirstEphemeral) { t.Fatalf("ReservePort(.., .., .., 0) = %d, want port number >= %d to be picked", gotPort, FirstEphemeral) diff --git a/pkg/tcpip/sample/tun_tcp_connect/BUILD b/pkg/tcpip/sample/tun_tcp_connect/BUILD index a57752a7c..d7496fde6 100644 --- a/pkg/tcpip/sample/tun_tcp_connect/BUILD +++ b/pkg/tcpip/sample/tun_tcp_connect/BUILD @@ -5,6 +5,7 @@ package(licenses = ["notice"]) go_binary( name = "tun_tcp_connect", srcs = ["main.go"], + visibility = ["//:sandbox"], deps = [ "//pkg/tcpip", "//pkg/tcpip/buffer", diff --git a/pkg/tcpip/sample/tun_tcp_echo/BUILD b/pkg/tcpip/sample/tun_tcp_echo/BUILD index dad8ef399..875561566 100644 --- a/pkg/tcpip/sample/tun_tcp_echo/BUILD +++ b/pkg/tcpip/sample/tun_tcp_echo/BUILD @@ -5,6 +5,7 @@ package(licenses = ["notice"]) go_binary( name = "tun_tcp_echo", srcs = ["main.go"], + visibility = ["//:sandbox"], deps = [ "//pkg/tcpip", "//pkg/tcpip/link/fdbased", diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD index 69077669a..b8f9517d0 100644 --- a/pkg/tcpip/stack/BUILD +++ b/pkg/tcpip/stack/BUILD @@ -59,6 +59,7 @@ go_test( ], deps = [ ":stack", + "//pkg/rand", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/checker", diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go index cfdd0496e..4722ec9ce 100644 --- a/pkg/tcpip/stack/ndp.go +++ b/pkg/tcpip/stack/ndp.go @@ -58,6 +58,14 @@ const ( // Default = true. defaultDiscoverOnLinkPrefixes = true + // defaultAutoGenGlobalAddresses is the default configuration for + // whether or not to generate global IPv6 addresses in response to + // receiving a new Prefix Information option with its Autonomous + // Address AutoConfiguration flag set, as a host. + // + // Default = true. + defaultAutoGenGlobalAddresses = true + // minimumRetransmitTimer is the minimum amount of time to wait between // sending NDP Neighbor solicitation messages. Note, RFC 4861 does // not impose a minimum Retransmit Timer, but we do here to make sure @@ -87,6 +95,24 @@ const ( // // Max = 10. MaxDiscoveredOnLinkPrefixes = 10 + + // validPrefixLenForAutoGen is the expected prefix length that an + // address can be generated for. Must be 64 bits as the interface + // identifier (IID) is 64 bits and an IPv6 address is 128 bits, so + // 128 - 64 = 64. + validPrefixLenForAutoGen = 64 +) + +var ( + // MinPrefixInformationValidLifetimeForUpdate is the minimum Valid + // Lifetime to update the valid lifetime of a generated address by + // SLAAC. + // + // This is exported as a variable (instead of a constant) so tests + // can update it to a smaller value. + // + // Min = 2hrs. + MinPrefixInformationValidLifetimeForUpdate = 2 * time.Hour ) // NDPDispatcher is the interface integrators of netstack must implement to @@ -105,40 +131,70 @@ type NDPDispatcher interface { OnDuplicateAddressDetectionStatus(nicID tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error) // OnDefaultRouterDiscovered will be called when a new default router is - // discovered. Implementations must return true along with a new valid - // route table if the newly discovered router should be remembered. If - // an implementation returns false, the second return value will be - // ignored. + // discovered. Implementations must return true if the newly discovered + // router should be remembered. // // This function is not permitted to block indefinitely. This function // is also not permitted to call into the stack. - OnDefaultRouterDiscovered(nicID tcpip.NICID, addr tcpip.Address) (bool, []tcpip.Route) + OnDefaultRouterDiscovered(nicID tcpip.NICID, addr tcpip.Address) bool // OnDefaultRouterInvalidated will be called when a discovered default - // router is invalidated. Implementers must return a new valid route - // table. + // router that was remembered is invalidated. // // This function is not permitted to block indefinitely. This function // is also not permitted to call into the stack. - OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip.Address) []tcpip.Route + OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip.Address) // OnOnLinkPrefixDiscovered will be called when a new on-link prefix is - // discovered. Implementations must return true along with a new valid - // route table if the newly discovered on-link prefix should be - // remembered. If an implementation returns false, the second return - // value will be ignored. + // discovered. Implementations must return true if the newly discovered + // on-link prefix should be remembered. // // This function is not permitted to block indefinitely. This function // is also not permitted to call into the stack. - OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip.Subnet) (bool, []tcpip.Route) + OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip.Subnet) bool // OnOnLinkPrefixInvalidated will be called when a discovered on-link - // prefix is invalidated. Implementers must return a new valid route - // table. + // prefix that was remembered is invalidated. // // This function is not permitted to block indefinitely. This function // is also not permitted to call into the stack. - OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpip.Subnet) []tcpip.Route + OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpip.Subnet) + + // OnAutoGenAddress will be called when a new prefix with its + // autonomous address-configuration flag set has been received and SLAAC + // has been performed. Implementations may prevent the stack from + // assigning the address to the NIC by returning false. + // + // This function is not permitted to block indefinitely. It must not + // call functions on the stack itself. + OnAutoGenAddress(tcpip.NICID, tcpip.AddressWithPrefix) bool + + // OnAutoGenAddressDeprecated will be called when an auto-generated + // address (as part of SLAAC) has been deprecated, but is still + // considered valid. Note, if an address is invalidated at the same + // time it is deprecated, the deprecation event MAY be omitted. + // + // This function is not permitted to block indefinitely. It must not + // call functions on the stack itself. + OnAutoGenAddressDeprecated(tcpip.NICID, tcpip.AddressWithPrefix) + + // OnAutoGenAddressInvalidated will be called when an auto-generated + // address (as part of SLAAC) has been invalidated. + // + // This function is not permitted to block indefinitely. It must not + // call functions on the stack itself. + OnAutoGenAddressInvalidated(tcpip.NICID, tcpip.AddressWithPrefix) + + // OnRecursiveDNSServerOption will be called when an NDP option with + // recursive DNS servers has been received. Note, addrs may contain + // link-local addresses. + // + // It is up to the caller to use the DNS Servers only for their valid + // lifetime. OnRecursiveDNSServerOption may be called for new or + // already known DNS servers. If called with known DNS servers, their + // valid lifetimes must be refreshed to lifetime (it may be increased, + // decreased, or completely invalidated when lifetime = 0). + OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tcpip.Address, lifetime time.Duration) } // NDPConfigurations is the NDP configurations for the netstack. @@ -168,6 +224,17 @@ type NDPConfigurations struct { // will be discovered from Router Advertisements' Prefix Information // option. This configuration is ignored if HandleRAs is false. DiscoverOnLinkPrefixes bool + + // AutoGenGlobalAddresses determines whether or not global IPv6 + // addresses will be generated for a NIC in response to receiving a new + // Prefix Information option with its Autonomous Address + // AutoConfiguration flag set, as a host, as per RFC 4862 (SLAAC). + // + // Note, if an address was already generated for some unique prefix, as + // part of SLAAC, this option does not affect whether or not the + // lifetime(s) of the generated address changes; this option only + // affects the generation of new addresses as part of SLAAC. + AutoGenGlobalAddresses bool } // DefaultNDPConfigurations returns an NDPConfigurations populated with @@ -179,6 +246,7 @@ func DefaultNDPConfigurations() NDPConfigurations { HandleRAs: defaultHandleRAs, DiscoverDefaultRouters: defaultDiscoverDefaultRouters, DiscoverOnLinkPrefixes: defaultDiscoverOnLinkPrefixes, + AutoGenGlobalAddresses: defaultAutoGenGlobalAddresses, } } @@ -210,6 +278,9 @@ type ndpState struct { // The on-link prefixes discovered through Router Advertisements' Prefix // Information option. onLinkPrefixes map[tcpip.Subnet]onLinkPrefixState + + // The addresses generated by SLAAC. + autoGenAddresses map[tcpip.Address]autoGenAddressState } // dadState holds the Duplicate Address Detection timer and channel to signal @@ -270,6 +341,43 @@ type onLinkPrefixState struct { doNotInvalidate *bool } +// autoGenAddressState holds data associated with an address generated via +// SLAAC. +type autoGenAddressState struct { + // A reference to the referencedNetworkEndpoint that this autoGenAddressState + // is holding state for. + ref *referencedNetworkEndpoint + + deprecationTimer *time.Timer + + // Used to signal the timer not to deprecate the SLAAC address in a race + // condition. Used for the same reason as doNotInvalidate, but for deprecating + // an address. + doNotDeprecate *bool + + invalidationTimer *time.Timer + + // Used to signal the timer not to invalidate the SLAAC address (A) in + // a race condition (T1 is a goroutine that handles a PI for A and T2 + // is the goroutine that handles A's invalidation timer firing): + // T1: Receive a new PI for A + // T1: Obtain the NIC's lock before processing the PI + // T2: A's invalidation timer fires, and gets blocked on obtaining the + // NIC's lock + // T1: Refreshes/extends A's lifetime & releases NIC's lock + // T2: Obtains NIC's lock & invalidates A immediately + // + // To resolve this, T1 will check to see if the timer already fired, and + // inform the timer using doNotInvalidate to not invalidate A, so that + // once T2 obtains the lock, it will see that it is set to true and do + // nothing further. + doNotInvalidate *bool + + // Nonzero only when the address is not valid forever (invalidationTimer + // is not nil). + validUntil time.Time +} + // startDuplicateAddressDetection performs Duplicate Address Detection. // // This function must only be called by IPv6 addresses that are currently @@ -534,19 +642,21 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) { // we do not check the iterator for errors on calls to Next. it, _ := ra.Options().Iter(false) for opt, done, _ := it.Next(); !done; opt, done, _ = it.Next() { - switch opt.Type() { - case header.NDPPrefixInformationType: - if !ndp.configs.DiscoverOnLinkPrefixes { + switch opt := opt.(type) { + case header.NDPRecursiveDNSServer: + if ndp.nic.stack.ndpDisp == nil { continue } - pi := opt.(header.NDPPrefixInformation) + ndp.nic.stack.ndpDisp.OnRecursiveDNSServerOption(ndp.nic.ID(), opt.Addresses(), opt.Lifetime()) - prefix := pi.Subnet() + case header.NDPPrefixInformation: + prefix := opt.Subnet() // Is the prefix a link-local? if header.IsV6LinkLocalAddress(prefix.ID()) { - // ...Yes, skip as per RFC 4861 section 6.3.4. + // ...Yes, skip as per RFC 4861 section 6.3.4, + // and RFC 4862 section 5.5.3.b (for SLAAC). continue } @@ -557,82 +667,13 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) { continue } - if !pi.OnLinkFlag() { - // Not on-link so don't "discover" it as an - // on-link prefix. - continue - } - - prefixState, ok := ndp.onLinkPrefixes[prefix] - vl := pi.ValidLifetime() - switch { - case !ok && vl == 0: - // Don't know about this prefix but has a zero - // valid lifetime, so just ignore. - continue - - case !ok && vl != 0: - // This is a new on-link prefix we are - // discovering. - // - // Only remember it if we currently know about - // less than MaxDiscoveredOnLinkPrefixes on-link - // prefixes. - if len(ndp.onLinkPrefixes) < MaxDiscoveredOnLinkPrefixes { - ndp.rememberOnLinkPrefix(prefix, vl) - } - continue - - case ok && vl == 0: - // We know about the on-link prefix, but it is - // no longer to be considered on-link, so - // invalidate it. - ndp.invalidateOnLinkPrefix(prefix) - continue + if opt.OnLinkFlag() { + ndp.handleOnLinkPrefixInformation(opt) } - // This is an already discovered on-link prefix with a - // new non-zero valid lifetime. - // Update the invalidation timer. - timer := prefixState.invalidationTimer - - if timer == nil && vl >= header.NDPPrefixInformationInfiniteLifetime { - // Had infinite valid lifetime before and - // continues to have an invalid lifetime. Do - // nothing further. - continue + if opt.AutonomousAddressConfigurationFlag() { + ndp.handleAutonomousPrefixInformation(opt) } - - if timer != nil && !timer.Stop() { - // If we reach this point, then we know the - // timer already fired after we took the NIC - // lock. Inform the timer to not invalidate - // the prefix once it obtains the lock as we - // just got a new PI that refeshes its lifetime - // to a non-zero value. See - // onLinkPrefixState.doNotInvalidate for more - // details. - *prefixState.doNotInvalidate = true - } - - if vl >= header.NDPPrefixInformationInfiniteLifetime { - // Prefix is now valid forever so we don't need - // an invalidation timer. - prefixState.invalidationTimer = nil - ndp.onLinkPrefixes[prefix] = prefixState - continue - } - - if timer != nil { - // We already have a timer so just reset it to - // expire after the new valid lifetime. - timer.Reset(vl) - continue - } - - // We do not have a timer so just create a new one. - prefixState.invalidationTimer = ndp.prefixInvalidationCallback(prefix, vl, prefixState.doNotInvalidate) - ndp.onLinkPrefixes[prefix] = prefixState } // TODO(b/141556115): Do (MTU) Parameter Discovery. @@ -641,7 +682,7 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) { // invalidateDefaultRouter invalidates a discovered default router. // -// The NIC that ndp belongs to and its associated stack MUST be locked. +// The NIC that ndp belongs to MUST be locked. func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) { rtr, ok := ndp.defaultRouters[ip] @@ -659,8 +700,8 @@ func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) { delete(ndp.defaultRouters, ip) // Let the integrator know a discovered default router is invalidated. - if ndp.nic.stack.ndpDisp != nil { - ndp.nic.stack.routeTable = ndp.nic.stack.ndpDisp.OnDefaultRouterInvalidated(ndp.nic.ID(), ip) + if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil { + ndpDisp.OnDefaultRouterInvalidated(ndp.nic.ID(), ip) } } @@ -669,15 +710,15 @@ func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) { // // The router identified by ip MUST NOT already be known by the NIC. // -// The NIC that ndp belongs to and its associated stack MUST be locked. +// The NIC that ndp belongs to MUST be locked. func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) { - if ndp.nic.stack.ndpDisp == nil { + ndpDisp := ndp.nic.stack.ndpDisp + if ndpDisp == nil { return } // Inform the integrator when we discovered a default router. - remember, routeTable := ndp.nic.stack.ndpDisp.OnDefaultRouterDiscovered(ndp.nic.ID(), ip) - if !remember { + if !ndpDisp.OnDefaultRouterDiscovered(ndp.nic.ID(), ip) { // Informed by the integrator to not remember the router, do // nothing further. return @@ -704,8 +745,6 @@ func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) { }), doNotInvalidate: &doNotInvalidate, } - - ndp.nic.stack.routeTable = routeTable } // rememberOnLinkPrefix remembers a newly discovered on-link prefix with IPv6 @@ -713,15 +752,15 @@ func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) { // // The prefix identified by prefix MUST NOT already be known. // -// The NIC that ndp belongs to and its associated stack MUST be locked. +// The NIC that ndp belongs to MUST be locked. func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration) { - if ndp.nic.stack.ndpDisp == nil { + ndpDisp := ndp.nic.stack.ndpDisp + if ndpDisp == nil { return } // Inform the integrator when we discovered an on-link prefix. - remember, routeTable := ndp.nic.stack.ndpDisp.OnOnLinkPrefixDiscovered(ndp.nic.ID(), prefix) - if !remember { + if !ndpDisp.OnOnLinkPrefixDiscovered(ndp.nic.ID(), prefix) { // Informed by the integrator to not remember the prefix, do // nothing further. return @@ -734,7 +773,7 @@ func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration) var timer *time.Timer // Only create a timer if the lifetime is not infinite. - if l < header.NDPPrefixInformationInfiniteLifetime { + if l < header.NDPInfiniteLifetime { timer = ndp.prefixInvalidationCallback(prefix, l, &doNotInvalidate) } @@ -742,13 +781,11 @@ func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration) invalidationTimer: timer, doNotInvalidate: &doNotInvalidate, } - - ndp.nic.stack.routeTable = routeTable } // invalidateOnLinkPrefix invalidates a discovered on-link prefix. // -// The NIC that ndp belongs to and its associated stack MUST be locked. +// The NIC that ndp belongs to MUST be locked. func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) { s, ok := ndp.onLinkPrefixes[prefix] @@ -769,8 +806,8 @@ func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) { delete(ndp.onLinkPrefixes, prefix) // Let the integrator know a discovered on-link prefix is invalidated. - if ndp.nic.stack.ndpDisp != nil { - ndp.nic.stack.routeTable = ndp.nic.stack.ndpDisp.OnOnLinkPrefixInvalidated(ndp.nic.ID(), prefix) + if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil { + ndpDisp.OnOnLinkPrefixInvalidated(ndp.nic.ID(), prefix) } } @@ -795,3 +832,466 @@ func (ndp *ndpState) prefixInvalidationCallback(prefix tcpip.Subnet, vl time.Dur ndp.invalidateOnLinkPrefix(prefix) }) } + +// handleOnLinkPrefixInformation handles a Prefix Information option with +// its on-link flag set, as per RFC 4861 section 6.3.4. +// +// handleOnLinkPrefixInformation assumes that the prefix this pi is for is +// not the link-local prefix and the on-link flag is set. +// +// The NIC that ndp belongs to MUST be locked. +func (ndp *ndpState) handleOnLinkPrefixInformation(pi header.NDPPrefixInformation) { + prefix := pi.Subnet() + prefixState, ok := ndp.onLinkPrefixes[prefix] + vl := pi.ValidLifetime() + + if !ok && vl == 0 { + // Don't know about this prefix but it has a zero valid + // lifetime, so just ignore. + return + } + + if !ok && vl != 0 { + // This is a new on-link prefix we are discovering + // + // Only remember it if we currently know about less than + // MaxDiscoveredOnLinkPrefixes on-link prefixes. + if ndp.configs.DiscoverOnLinkPrefixes && len(ndp.onLinkPrefixes) < MaxDiscoveredOnLinkPrefixes { + ndp.rememberOnLinkPrefix(prefix, vl) + } + return + } + + if ok && vl == 0 { + // We know about the on-link prefix, but it is + // no longer to be considered on-link, so + // invalidate it. + ndp.invalidateOnLinkPrefix(prefix) + return + } + + // This is an already discovered on-link prefix with a + // new non-zero valid lifetime. + // Update the invalidation timer. + timer := prefixState.invalidationTimer + + if timer == nil && vl >= header.NDPInfiniteLifetime { + // Had infinite valid lifetime before and + // continues to have an invalid lifetime. Do + // nothing further. + return + } + + if timer != nil && !timer.Stop() { + // If we reach this point, then we know the timer alread fired + // after we took the NIC lock. Inform the timer to not + // invalidate the prefix once it obtains the lock as we just + // got a new PI that refreshes its lifetime to a non-zero value. + // See onLinkPrefixState.doNotInvalidate for more details. + *prefixState.doNotInvalidate = true + } + + if vl >= header.NDPInfiniteLifetime { + // Prefix is now valid forever so we don't need + // an invalidation timer. + prefixState.invalidationTimer = nil + ndp.onLinkPrefixes[prefix] = prefixState + return + } + + if timer != nil { + // We already have a timer so just reset it to + // expire after the new valid lifetime. + timer.Reset(vl) + return + } + + // We do not have a timer so just create a new one. + prefixState.invalidationTimer = ndp.prefixInvalidationCallback(prefix, vl, prefixState.doNotInvalidate) + ndp.onLinkPrefixes[prefix] = prefixState +} + +// handleAutonomousPrefixInformation handles a Prefix Information option with +// its autonomous flag set, as per RFC 4862 section 5.5.3. +// +// handleAutonomousPrefixInformation assumes that the prefix this pi is for is +// not the link-local prefix and the autonomous flag is set. +// +// The NIC that ndp belongs to and its associated stack MUST be locked. +func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInformation) { + vl := pi.ValidLifetime() + pl := pi.PreferredLifetime() + + // If the preferred lifetime is greater than the valid lifetime, + // silently ignore the Prefix Information option, as per RFC 4862 + // section 5.5.3.c. + if pl > vl { + return + } + + prefix := pi.Subnet() + + // Check if we already have an auto-generated address for prefix. + for addr, addrState := range ndp.autoGenAddresses { + refAddrWithPrefix := tcpip.AddressWithPrefix{Address: addr, PrefixLen: addrState.ref.ep.PrefixLen()} + if refAddrWithPrefix.Subnet() != prefix { + continue + } + + // At this point, we know we are refreshing a SLAAC generated IPv6 address + // with the prefix prefix. Do the work as outlined by RFC 4862 section + // 5.5.3.e. + ndp.refreshAutoGenAddressLifetimes(addr, pl, vl) + return + } + + // We do not already have an address within the prefix, prefix. Do the + // work as outlined by RFC 4862 section 5.5.3.d if n is configured + // to auto-generated global addresses by SLAAC. + ndp.newAutoGenAddress(prefix, pl, vl) +} + +// newAutoGenAddress generates a new SLAAC address with the provided lifetimes +// for prefix. +// +// pl is the new preferred lifetime. vl is the new valid lifetime. +func (ndp *ndpState) newAutoGenAddress(prefix tcpip.Subnet, pl, vl time.Duration) { + // Are we configured to auto-generate new global addresses? + if !ndp.configs.AutoGenGlobalAddresses { + return + } + + // If we do not already have an address for this prefix and the valid + // lifetime is 0, no need to do anything further, as per RFC 4862 + // section 5.5.3.d. + if vl == 0 { + return + } + + // Make sure the prefix is valid (as far as its length is concerned) to + // generate a valid IPv6 address from an interface identifier (IID), as + // per RFC 4862 sectiion 5.5.3.d. + if prefix.Prefix() != validPrefixLenForAutoGen { + return + } + + addrBytes := []byte(prefix.ID()) + if oIID := ndp.nic.stack.opaqueIIDOpts; oIID.NICNameFromID != nil { + addrBytes = header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], prefix, oIID.NICNameFromID(ndp.nic.ID(), ndp.nic.name), 0 /* dadCounter */, oIID.SecretKey) + } else { + // Only attempt to generate an interface-specific IID if we have a valid + // link address. + // + // TODO(b/141011931): Validate a LinkEndpoint's link address (provided by + // LinkEndpoint.LinkAddress) before reaching this point. + linkAddr := ndp.nic.linkEP.LinkAddress() + if !header.IsValidUnicastEthernetAddress(linkAddr) { + return + } + + // Generate an address within prefix from the modified EUI-64 of ndp's NIC's + // Ethernet MAC address. + header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, addrBytes[header.IIDOffsetInIPv6Address:]) + } + addr := tcpip.Address(addrBytes) + addrWithPrefix := tcpip.AddressWithPrefix{ + Address: addr, + PrefixLen: validPrefixLenForAutoGen, + } + + // If the nic already has this address, do nothing further. + if ndp.nic.hasPermanentAddrLocked(addr) { + return + } + + // Inform the integrator that we have a new SLAAC address. + ndpDisp := ndp.nic.stack.ndpDisp + if ndpDisp == nil { + return + } + if !ndpDisp.OnAutoGenAddress(ndp.nic.ID(), addrWithPrefix) { + // Informed by the integrator not to add the address. + return + } + + protocolAddr := tcpip.ProtocolAddress{ + Protocol: header.IPv6ProtocolNumber, + AddressWithPrefix: addrWithPrefix, + } + // If the preferred lifetime is zero, then the address should be considered + // deprecated. + deprecated := pl == 0 + ref, err := ndp.nic.addAddressLocked(protocolAddr, FirstPrimaryEndpoint, permanent, slaac, deprecated) + if err != nil { + log.Fatalf("ndp: error when adding address %s: %s", protocolAddr, err) + } + + // Setup the timers to deprecate and invalidate this newly generated address. + + var doNotDeprecate bool + var pTimer *time.Timer + if !deprecated && pl < header.NDPInfiniteLifetime { + pTimer = ndp.autoGenAddrDeprecationTimer(addr, pl, &doNotDeprecate) + } + + var doNotInvalidate bool + var vTimer *time.Timer + if vl < header.NDPInfiniteLifetime { + vTimer = ndp.autoGenAddrInvalidationTimer(addr, vl, &doNotInvalidate) + } + + ndp.autoGenAddresses[addr] = autoGenAddressState{ + ref: ref, + deprecationTimer: pTimer, + doNotDeprecate: &doNotDeprecate, + invalidationTimer: vTimer, + doNotInvalidate: &doNotInvalidate, + validUntil: time.Now().Add(vl), + } +} + +// refreshAutoGenAddressLifetimes refreshes the lifetime of a SLAAC generated +// address addr. +// +// pl is the new preferred lifetime. vl is the new valid lifetime. +func (ndp *ndpState) refreshAutoGenAddressLifetimes(addr tcpip.Address, pl, vl time.Duration) { + addrState, ok := ndp.autoGenAddresses[addr] + if !ok { + log.Fatalf("ndp: SLAAC state not found to refresh lifetimes for %s", addr) + } + defer func() { ndp.autoGenAddresses[addr] = addrState }() + + // If the preferred lifetime is zero, then the address should be considered + // deprecated. + deprecated := pl == 0 + wasDeprecated := addrState.ref.deprecated + addrState.ref.deprecated = deprecated + + // Only send the deprecation event if the deprecated status for addr just + // changed from non-deprecated to deprecated. + if !wasDeprecated && deprecated { + ndp.notifyAutoGenAddressDeprecated(addr) + } + + // If addr was preferred for some finite lifetime before, stop the deprecation + // timer so it can be reset. + if t := addrState.deprecationTimer; t != nil && !t.Stop() { + *addrState.doNotDeprecate = true + } + + // Reset the deprecation timer. + if pl >= header.NDPInfiniteLifetime || deprecated { + // If addr is preferred forever or it has been deprecated already, there is + // no need for a deprecation timer. + addrState.deprecationTimer = nil + } else if addrState.deprecationTimer == nil { + // addr is now preferred for a finite lifetime. + addrState.deprecationTimer = ndp.autoGenAddrDeprecationTimer(addr, pl, addrState.doNotDeprecate) + } else { + // addr continues to be preferred for a finite lifetime. + addrState.deprecationTimer.Reset(pl) + } + + // As per RFC 4862 section 5.5.3.e, the valid lifetime of the address + // + // + // 1) If the received Valid Lifetime is greater than 2 hours or greater than + // RemainingLifetime, set the valid lifetime of the address to the + // advertised Valid Lifetime. + // + // 2) If RemainingLifetime is less than or equal to 2 hours, ignore the + // advertised Valid Lifetime. + // + // 3) Otherwise, reset the valid lifetime of the address to 2 hours. + + // Handle the infinite valid lifetime separately as we do not keep a timer in + // this case. + if vl >= header.NDPInfiniteLifetime { + if addrState.invalidationTimer != nil { + // Valid lifetime was finite before, but now it is valid forever. + if !addrState.invalidationTimer.Stop() { + *addrState.doNotInvalidate = true + } + addrState.invalidationTimer = nil + addrState.validUntil = time.Time{} + } + + return + } + + var effectiveVl time.Duration + var rl time.Duration + + // If the address was originally set to be valid forever, assume the remaining + // time to be the maximum possible value. + if addrState.invalidationTimer == nil { + rl = header.NDPInfiniteLifetime + } else { + rl = time.Until(addrState.validUntil) + } + + if vl > MinPrefixInformationValidLifetimeForUpdate || vl > rl { + effectiveVl = vl + } else if rl <= MinPrefixInformationValidLifetimeForUpdate { + return + } else { + effectiveVl = MinPrefixInformationValidLifetimeForUpdate + } + + if addrState.invalidationTimer == nil { + addrState.invalidationTimer = ndp.autoGenAddrInvalidationTimer(addr, effectiveVl, addrState.doNotInvalidate) + } else { + if !addrState.invalidationTimer.Stop() { + *addrState.doNotInvalidate = true + } + addrState.invalidationTimer.Reset(effectiveVl) + } + + addrState.validUntil = time.Now().Add(effectiveVl) +} + +// notifyAutoGenAddressDeprecated notifies the stack's NDP dispatcher that addr +// has been deprecated. +func (ndp *ndpState) notifyAutoGenAddressDeprecated(addr tcpip.Address) { + if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil { + ndpDisp.OnAutoGenAddressDeprecated(ndp.nic.ID(), tcpip.AddressWithPrefix{ + Address: addr, + PrefixLen: validPrefixLenForAutoGen, + }) + } +} + +// invalidateAutoGenAddress invalidates an auto-generated address. +// +// The NIC that ndp belongs to MUST be locked. +func (ndp *ndpState) invalidateAutoGenAddress(addr tcpip.Address) { + if !ndp.cleanupAutoGenAddrResourcesAndNotify(addr) { + return + } + + ndp.nic.removePermanentAddressLocked(addr) +} + +// cleanupAutoGenAddrResourcesAndNotify cleans up an invalidated auto-generated +// address's resources from ndp. If the stack has an NDP dispatcher, it will +// be notified that addr has been invalidated. +// +// Returns true if ndp had resources for addr to cleanup. +// +// The NIC that ndp belongs to MUST be locked. +func (ndp *ndpState) cleanupAutoGenAddrResourcesAndNotify(addr tcpip.Address) bool { + state, ok := ndp.autoGenAddresses[addr] + + if !ok { + return false + } + + if state.deprecationTimer != nil { + state.deprecationTimer.Stop() + state.deprecationTimer = nil + *state.doNotDeprecate = true + } + + state.doNotDeprecate = nil + + if state.invalidationTimer != nil { + state.invalidationTimer.Stop() + state.invalidationTimer = nil + *state.doNotInvalidate = true + } + + state.doNotInvalidate = nil + + delete(ndp.autoGenAddresses, addr) + + if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil { + ndpDisp.OnAutoGenAddressInvalidated(ndp.nic.ID(), tcpip.AddressWithPrefix{ + Address: addr, + PrefixLen: validPrefixLenForAutoGen, + }) + } + + return true +} + +// autoGenAddrDeprecationTimer returns a new deprecation timer for an +// auto-generated address that fires after pl. +// +// doNotDeprecate is used to inform the timer when it fires at the same time +// that an auto-generated address's preferred lifetime gets refreshed. See +// autoGenAddrState.doNotDeprecate for more details. +func (ndp *ndpState) autoGenAddrDeprecationTimer(addr tcpip.Address, pl time.Duration, doNotDeprecate *bool) *time.Timer { + return time.AfterFunc(pl, func() { + ndp.nic.mu.Lock() + defer ndp.nic.mu.Unlock() + + if *doNotDeprecate { + *doNotDeprecate = false + return + } + + addrState, ok := ndp.autoGenAddresses[addr] + if !ok { + log.Fatalf("ndp: must have an autoGenAddressess entry for the SLAAC generated IPv6 address %s", addr) + } + addrState.ref.deprecated = true + ndp.notifyAutoGenAddressDeprecated(addr) + addrState.deprecationTimer = nil + ndp.autoGenAddresses[addr] = addrState + }) +} + +// autoGenAddrInvalidationTimer returns a new invalidation timer for an +// auto-generated address that fires after vl. +// +// doNotInvalidate is used to inform the timer when it fires at the same time +// that an auto-generated address's valid lifetime gets refreshed. See +// autoGenAddrState.doNotInvalidate for more details. +func (ndp *ndpState) autoGenAddrInvalidationTimer(addr tcpip.Address, vl time.Duration, doNotInvalidate *bool) *time.Timer { + return time.AfterFunc(vl, func() { + ndp.nic.mu.Lock() + defer ndp.nic.mu.Unlock() + + if *doNotInvalidate { + *doNotInvalidate = false + return + } + + ndp.invalidateAutoGenAddress(addr) + }) +} + +// cleanupHostOnlyState cleans up any state that is only useful for hosts. +// +// cleanupHostOnlyState MUST be called when ndp's NIC is transitioning from a +// host to a router. This function will invalidate all discovered on-link +// prefixes, discovered routers, and auto-generated addresses as routers do not +// normally process Router Advertisements to discover default routers and +// on-link prefixes, and auto-generate addresses via SLAAC. +// +// The NIC that ndp belongs to MUST be locked. +func (ndp *ndpState) cleanupHostOnlyState() { + for addr, _ := range ndp.autoGenAddresses { + ndp.invalidateAutoGenAddress(addr) + } + + if got := len(ndp.autoGenAddresses); got != 0 { + log.Fatalf("ndp: still have auto-generated addresses after cleaning up, found = %d", got) + } + + for prefix, _ := range ndp.onLinkPrefixes { + ndp.invalidateOnLinkPrefix(prefix) + } + + if got := len(ndp.onLinkPrefixes); got != 0 { + log.Fatalf("ndp: still have discovered on-link prefixes after cleaning up, found = %d", got) + } + + for router, _ := range ndp.defaultRouters { + ndp.invalidateDefaultRouter(router) + } + + if got := len(ndp.defaultRouters); got != 0 { + log.Fatalf("ndp: still have discovered default routers after cleaning up, found = %d", got) + } +} diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go index 5b901f947..8d89859ba 100644 --- a/pkg/tcpip/stack/ndp_test.go +++ b/pkg/tcpip/stack/ndp_test.go @@ -21,6 +21,7 @@ import ( "time" "github.com/google/go-cmp/cmp" + "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/checker" @@ -29,6 +30,8 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/icmp" + "gvisor.dev/gvisor/pkg/tcpip/transport/udp" + "gvisor.dev/gvisor/pkg/waiter" ) const ( @@ -38,15 +41,47 @@ const ( linkAddr1 = "\x02\x02\x03\x04\x05\x06" linkAddr2 = "\x02\x02\x03\x04\x05\x07" linkAddr3 = "\x02\x02\x03\x04\x05\x08" - defaultTimeout = 250 * time.Millisecond + defaultTimeout = 100 * time.Millisecond ) var ( llAddr1 = header.LinkLocalAddr(linkAddr1) llAddr2 = header.LinkLocalAddr(linkAddr2) llAddr3 = header.LinkLocalAddr(linkAddr3) + dstAddr = tcpip.FullAddress{ + Addr: "\x0a\x0b\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01", + Port: 25, + } ) +func addrForSubnet(subnet tcpip.Subnet, linkAddr tcpip.LinkAddress) tcpip.AddressWithPrefix { + if !header.IsValidUnicastEthernetAddress(linkAddr) { + return tcpip.AddressWithPrefix{} + } + + addrBytes := []byte(subnet.ID()) + header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, addrBytes[header.IIDOffsetInIPv6Address:]) + return tcpip.AddressWithPrefix{ + Address: tcpip.Address(addrBytes), + PrefixLen: 64, + } +} + +// prefixSubnetAddr returns a prefix (Address + Length), the prefix's equivalent +// tcpip.Subnet, and an address where the lower half of the address is composed +// of the EUI-64 of linkAddr if it is a valid unicast ethernet address. +func prefixSubnetAddr(offset uint8, linkAddr tcpip.LinkAddress) (tcpip.AddressWithPrefix, tcpip.Subnet, tcpip.AddressWithPrefix) { + prefixBytes := []byte{1, 2, 3, 4, 5, 6, 7, 8 + offset, 0, 0, 0, 0, 0, 0, 0, 0} + prefix := tcpip.AddressWithPrefix{ + Address: tcpip.Address(prefixBytes), + PrefixLen: 64, + } + + subnet := prefix.Subnet() + + return prefix, subnet, addrForSubnet(subnet, linkAddr) +} + // TestDADDisabled tests that an address successfully resolves immediately // when DAD is not enabled (the default for an empty stack.Options). func TestDADDisabled(t *testing.T) { @@ -54,7 +89,7 @@ func TestDADDisabled(t *testing.T) { NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, } - e := channel.New(10, 1280, linkAddr1) + e := channel.New(0, 1280, linkAddr1) s := stack.New(opts) if err := s.CreateNIC(1, e); err != nil { t.Fatalf("CreateNIC(_) = %s", err) @@ -103,6 +138,30 @@ type ndpPrefixEvent struct { discovered bool } +type ndpAutoGenAddrEventType int + +const ( + newAddr ndpAutoGenAddrEventType = iota + deprecatedAddr + invalidatedAddr +) + +type ndpAutoGenAddrEvent struct { + nicID tcpip.NICID + addr tcpip.AddressWithPrefix + eventType ndpAutoGenAddrEventType +} + +type ndpRDNSS struct { + addrs []tcpip.Address + lifetime time.Duration +} + +type ndpRDNSSEvent struct { + nicID tcpip.NICID + rdnss ndpRDNSS +} + var _ stack.NDPDispatcher = (*ndpDispatcher)(nil) // ndpDispatcher implements NDPDispatcher so tests can know when various NDP @@ -113,7 +172,8 @@ type ndpDispatcher struct { rememberRouter bool prefixC chan ndpPrefixEvent rememberPrefix bool - routeTable []tcpip.Route + autoGenAddrC chan ndpAutoGenAddrEvent + rdnssC chan ndpRDNSSEvent } // Implements stack.NDPDispatcher.OnDuplicateAddressDetectionStatus. @@ -129,101 +189,95 @@ func (n *ndpDispatcher) OnDuplicateAddressDetectionStatus(nicID tcpip.NICID, add } // Implements stack.NDPDispatcher.OnDefaultRouterDiscovered. -func (n *ndpDispatcher) OnDefaultRouterDiscovered(nicID tcpip.NICID, addr tcpip.Address) (bool, []tcpip.Route) { - if n.routerC != nil { - n.routerC <- ndpRouterEvent{ +func (n *ndpDispatcher) OnDefaultRouterDiscovered(nicID tcpip.NICID, addr tcpip.Address) bool { + if c := n.routerC; c != nil { + c <- ndpRouterEvent{ nicID, addr, true, } } - if !n.rememberRouter { - return false, nil - } - - rt := append([]tcpip.Route(nil), n.routeTable...) - rt = append(rt, tcpip.Route{ - Destination: header.IPv6EmptySubnet, - Gateway: addr, - NIC: nicID, - }) - n.routeTable = rt - return true, rt + return n.rememberRouter } // Implements stack.NDPDispatcher.OnDefaultRouterInvalidated. -func (n *ndpDispatcher) OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip.Address) []tcpip.Route { - if n.routerC != nil { - n.routerC <- ndpRouterEvent{ +func (n *ndpDispatcher) OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip.Address) { + if c := n.routerC; c != nil { + c <- ndpRouterEvent{ nicID, addr, false, } } - - var rt []tcpip.Route - exclude := tcpip.Route{ - Destination: header.IPv6EmptySubnet, - Gateway: addr, - NIC: nicID, - } - - for _, r := range n.routeTable { - if r != exclude { - rt = append(rt, r) - } - } - n.routeTable = rt - return rt } // Implements stack.NDPDispatcher.OnOnLinkPrefixDiscovered. -func (n *ndpDispatcher) OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip.Subnet) (bool, []tcpip.Route) { - if n.prefixC != nil { - n.prefixC <- ndpPrefixEvent{ +func (n *ndpDispatcher) OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip.Subnet) bool { + if c := n.prefixC; c != nil { + c <- ndpPrefixEvent{ nicID, prefix, true, } } - if !n.rememberPrefix { - return false, nil - } - - rt := append([]tcpip.Route(nil), n.routeTable...) - rt = append(rt, tcpip.Route{ - Destination: prefix, - NIC: nicID, - }) - n.routeTable = rt - return true, rt + return n.rememberPrefix } // Implements stack.NDPDispatcher.OnOnLinkPrefixInvalidated. -func (n *ndpDispatcher) OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpip.Subnet) []tcpip.Route { - if n.prefixC != nil { - n.prefixC <- ndpPrefixEvent{ +func (n *ndpDispatcher) OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpip.Subnet) { + if c := n.prefixC; c != nil { + c <- ndpPrefixEvent{ nicID, prefix, false, } } +} - rt := make([]tcpip.Route, 0) - exclude := tcpip.Route{ - Destination: prefix, - NIC: nicID, +func (n *ndpDispatcher) OnAutoGenAddress(nicID tcpip.NICID, addr tcpip.AddressWithPrefix) bool { + if c := n.autoGenAddrC; c != nil { + c <- ndpAutoGenAddrEvent{ + nicID, + addr, + newAddr, + } } + return true +} + +func (n *ndpDispatcher) OnAutoGenAddressDeprecated(nicID tcpip.NICID, addr tcpip.AddressWithPrefix) { + if c := n.autoGenAddrC; c != nil { + c <- ndpAutoGenAddrEvent{ + nicID, + addr, + deprecatedAddr, + } + } +} - for _, r := range n.routeTable { - if r != exclude { - rt = append(rt, r) +func (n *ndpDispatcher) OnAutoGenAddressInvalidated(nicID tcpip.NICID, addr tcpip.AddressWithPrefix) { + if c := n.autoGenAddrC; c != nil { + c <- ndpAutoGenAddrEvent{ + nicID, + addr, + invalidatedAddr, + } + } +} + +// Implements stack.NDPDispatcher.OnRecursiveDNSServerOption. +func (n *ndpDispatcher) OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tcpip.Address, lifetime time.Duration) { + if c := n.rdnssC; c != nil { + c <- ndpRDNSSEvent{ + nicID, + ndpRDNSS{ + addrs, + lifetime, + }, } } - n.routeTable = rt - return rt } // TestDADResolve tests that an address successfully resolves after performing @@ -246,7 +300,11 @@ func TestDADResolve(t *testing.T) { } for _, test := range tests { + test := test + t.Run(test.name, func(t *testing.T) { + t.Parallel() + ndpDisp := ndpDispatcher{ dadC: make(chan ndpDADEvent), } @@ -434,7 +492,7 @@ func TestDADFail(t *testing.T) { } opts.NDPConfigs.RetransmitTimer = time.Second * 2 - e := channel.New(10, 1280, linkAddr1) + e := channel.New(0, 1280, linkAddr1) s := stack.New(opts) if err := s.CreateNIC(1, e); err != nil { t.Fatalf("CreateNIC(_) = %s", err) @@ -515,7 +573,7 @@ func TestDADStop(t *testing.T) { NDPConfigs: ndpConfigs, } - e := channel.New(10, 1280, linkAddr1) + e := channel.New(0, 1280, linkAddr1) s := stack.New(opts) if err := s.CreateNIC(1, e); err != nil { t.Fatalf("CreateNIC(_) = %s", err) @@ -616,7 +674,7 @@ func TestSetNDPConfigurations(t *testing.T) { ndpDisp := ndpDispatcher{ dadC: make(chan ndpDADEvent), } - e := channel.New(10, 1280, linkAddr1) + e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, NDPDisp: &ndpDisp, @@ -781,16 +839,33 @@ func raBuf(ip tcpip.Address, rl uint16) tcpip.PacketBuffer { // // Note, raBufWithPI does not populate any of the RA fields other than the // Router Lifetime. -func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, onLink bool, vl uint32) tcpip.PacketBuffer { +func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, onLink, auto bool, vl, pl uint32) tcpip.PacketBuffer { flags := uint8(0) if onLink { - flags |= 128 + // The OnLink flag is the 7th bit in the flags byte. + flags |= 1 << 7 + } + if auto { + // The Address Auto-Configuration flag is the 6th bit in the + // flags byte. + flags |= 1 << 6 } + // A valid header.NDPPrefixInformation must be 30 bytes. buf := [30]byte{} + // The first byte in a header.NDPPrefixInformation is the Prefix Length + // field. buf[0] = uint8(prefix.PrefixLen) + // The 2nd byte within a header.NDPPrefixInformation is the Flags field. buf[1] = flags + // The Valid Lifetime field starts after the 2nd byte within a + // header.NDPPrefixInformation. binary.BigEndian.PutUint32(buf[2:], vl) + // The Preferred Lifetime field starts after the 6th byte within a + // header.NDPPrefixInformation. + binary.BigEndian.PutUint32(buf[6:], pl) + // The Prefix Address field starts after the 14th byte within a + // header.NDPPrefixInformation. copy(buf[14:], prefix.Address) return raBufWithOpts(ip, rl, header.NDPOptionsSerializer{ header.NDPPrefixInformation(buf[:]), @@ -812,10 +887,12 @@ func TestNoRouterDiscovery(t *testing.T) { forwarding := i&4 == 0 t.Run(fmt.Sprintf("HandleRAs(%t), DiscoverDefaultRouters(%t), Forwarding(%t)", handle, discover, forwarding), func(t *testing.T) { + t.Parallel() + ndpDisp := ndpDispatcher{ - routerC: make(chan ndpRouterEvent, 10), + routerC: make(chan ndpRouterEvent, 1), } - e := channel.New(10, 1280, linkAddr1) + e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, NDPConfigs: stack.NDPConfigurations{ @@ -835,19 +912,27 @@ func TestNoRouterDiscovery(t *testing.T) { select { case <-ndpDisp.routerC: t.Fatal("unexpectedly discovered a router when configured not to") - case <-time.After(defaultTimeout): + default: } }) } } +// Check e to make sure that the event is for addr on nic with ID 1, and the +// discovered flag set to discovered. +func checkRouterEvent(e ndpRouterEvent, addr tcpip.Address, discovered bool) string { + return cmp.Diff(ndpRouterEvent{nicID: 1, addr: addr, discovered: discovered}, e, cmp.AllowUnexported(e)) +} + // TestRouterDiscoveryDispatcherNoRemember tests that the stack does not // remember a discovered router when the dispatcher asks it not to. func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) { + t.Parallel() + ndpDisp := ndpDispatcher{ - routerC: make(chan ndpRouterEvent, 10), + routerC: make(chan ndpRouterEvent, 1), } - e := channel.New(10, 1280, linkAddr1) + e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, NDPConfigs: stack.NDPConfigurations{ @@ -861,59 +946,36 @@ func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) { t.Fatalf("CreateNIC(1) = %s", err) } - routeTable := []tcpip.Route{ - { - header.IPv6EmptySubnet, - llAddr3, - 1, - }, - } - s.SetRouteTable(routeTable) - - // Rx an RA with short lifetime. - lifetime := time.Duration(1) - e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, uint16(lifetime))) + // Receive an RA for a router we should not remember. + const lifetimeSeconds = 1 + e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, lifetimeSeconds)) select { - case r := <-ndpDisp.routerC: - if r.nicID != 1 { - t.Fatalf("got r.nicID = %d, want = 1", r.nicID) - } - if r.addr != llAddr2 { - t.Fatalf("got r.addr = %s, want = %s", r.addr, llAddr2) + case e := <-ndpDisp.routerC: + if diff := checkRouterEvent(e, llAddr2, true); diff != "" { + t.Errorf("router event mismatch (-want +got):\n%s", diff) } - if !r.discovered { - t.Fatal("got r.discovered = false, want = true") - } - case <-time.After(defaultTimeout): - t.Fatal("timeout waiting for router discovery event") - } - - // Original route table should not have been modified. - if got := s.GetRouteTable(); !cmp.Equal(got, routeTable) { - t.Fatalf("got GetRouteTable = %v, want = %v", got, routeTable) + default: + t.Fatal("expected router discovery event") } - // Wait for the normal invalidation time plus an extra second to - // make sure we do not actually receive any invalidation events as - // we should not have remembered the router in the first place. + // Wait for the invalidation time plus some buffer to make sure we do + // not actually receive any invalidation events as we should not have + // remembered the router in the first place. select { case <-ndpDisp.routerC: t.Fatal("should not have received any router events") - case <-time.After(lifetime*time.Second + defaultTimeout): - } - - // Original route table should not have been modified. - if got := s.GetRouteTable(); !cmp.Equal(got, routeTable) { - t.Fatalf("got GetRouteTable = %v, want = %v", got, routeTable) + case <-time.After(lifetimeSeconds*time.Second + defaultTimeout): } } func TestRouterDiscovery(t *testing.T) { + t.Parallel() + ndpDisp := ndpDispatcher{ - routerC: make(chan ndpRouterEvent, 10), + routerC: make(chan ndpRouterEvent, 1), rememberRouter: true, } - e := channel.New(10, 1280, linkAddr1) + e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, NDPConfigs: stack.NDPConfigurations{ @@ -923,22 +985,29 @@ func TestRouterDiscovery(t *testing.T) { NDPDisp: &ndpDisp, }) - waitForEvent := func(addr tcpip.Address, discovered bool, timeout time.Duration) { + expectRouterEvent := func(addr tcpip.Address, discovered bool) { t.Helper() select { - case r := <-ndpDisp.routerC: - if r.nicID != 1 { - t.Fatalf("got r.nicID = %d, want = 1", r.nicID) + case e := <-ndpDisp.routerC: + if diff := checkRouterEvent(e, addr, discovered); diff != "" { + t.Errorf("router event mismatch (-want +got):\n%s", diff) } - if r.addr != addr { - t.Fatalf("got r.addr = %s, want = %s", r.addr, addr) - } - if r.discovered != discovered { - t.Fatalf("got r.discovered = %t, want = %t", r.discovered, discovered) + default: + t.Fatal("expected router discovery event") + } + } + + expectAsyncRouterInvalidationEvent := func(addr tcpip.Address, timeout time.Duration) { + t.Helper() + + select { + case e := <-ndpDisp.routerC: + if diff := checkRouterEvent(e, addr, false); diff != "" { + t.Errorf("router event mismatch (-want +got):\n%s", diff) } case <-time.After(timeout): - t.Fatal("timeout waiting for router discovery event") + t.Fatal("timed out waiting for router discovery event") } } @@ -952,27 +1021,17 @@ func TestRouterDiscovery(t *testing.T) { select { case <-ndpDisp.routerC: t.Fatal("unexpectedly discovered a router with 0 lifetime") - case <-time.After(defaultTimeout): + default: } // Rx an RA from lladdr2 with a huge lifetime. e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 1000)) - waitForEvent(llAddr2, true, defaultTimeout) - - // Should have a default route through the discovered router. - if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr2, 1}}; !cmp.Equal(got, want) { - t.Fatalf("got GetRouteTable = %v, want = %v", got, want) - } + expectRouterEvent(llAddr2, true) // Rx an RA from another router (lladdr3) with non-zero lifetime. l3Lifetime := time.Duration(6) e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr3, uint16(l3Lifetime))) - waitForEvent(llAddr3, true, defaultTimeout) - - // Should have default routes through the discovered routers. - if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr2, 1}, {header.IPv6EmptySubnet, llAddr3, 1}}; !cmp.Equal(got, want) { - t.Fatalf("got GetRouteTable = %v, want = %v", got, want) - } + expectRouterEvent(llAddr3, true) // Rx an RA from lladdr2 with lesser lifetime. l2Lifetime := time.Duration(2) @@ -980,12 +1039,7 @@ func TestRouterDiscovery(t *testing.T) { select { case <-ndpDisp.routerC: t.Fatal("Should not receive a router event when updating lifetimes for known routers") - case <-time.After(defaultTimeout): - } - - // Should still have a default route through the discovered routers. - if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr2, 1}, {header.IPv6EmptySubnet, llAddr3, 1}}; !cmp.Equal(got, want) { - t.Fatalf("got GetRouteTable = %v, want = %v", got, want) + default: } // Wait for lladdr2's router invalidation timer to fire. The lifetime @@ -995,31 +1049,15 @@ func TestRouterDiscovery(t *testing.T) { // Wait for the normal lifetime plus an extra bit for the // router to get invalidated. If we don't get an invalidation // event after this time, then something is wrong. - waitForEvent(llAddr2, false, l2Lifetime*time.Second+defaultTimeout) - - // Should no longer have the default route through lladdr2. - if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr3, 1}}; !cmp.Equal(got, want) { - t.Fatalf("got GetRouteTable = %v, want = %v", got, want) - } + expectAsyncRouterInvalidationEvent(llAddr2, l2Lifetime*time.Second+defaultTimeout) // Rx an RA from lladdr2 with huge lifetime. e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 1000)) - waitForEvent(llAddr2, true, defaultTimeout) - - // Should have a default route through the discovered routers. - if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr3, 1}, {header.IPv6EmptySubnet, llAddr2, 1}}; !cmp.Equal(got, want) { - t.Fatalf("got GetRouteTable = %v, want = %v", got, want) - } + expectRouterEvent(llAddr2, true) // Rx an RA from lladdr2 with zero lifetime. It should be invalidated. e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 0)) - waitForEvent(llAddr2, false, defaultTimeout) - - // Should have deleted the default route through the router that just - // got invalidated. - if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr3, 1}}; !cmp.Equal(got, want) { - t.Fatalf("got GetRouteTable = %v, want = %v", got, want) - } + expectRouterEvent(llAddr2, false) // Wait for lladdr3's router invalidation timer to fire. The lifetime // of the router should have been updated to the most recent (smaller) @@ -1028,23 +1066,19 @@ func TestRouterDiscovery(t *testing.T) { // Wait for the normal lifetime plus an extra bit for the // router to get invalidated. If we don't get an invalidation // event after this time, then something is wrong. - waitForEvent(llAddr3, false, l3Lifetime*time.Second+defaultTimeout) - - // Should not have any routes now that all discovered routers have been - // invalidated. - if got := len(s.GetRouteTable()); got != 0 { - t.Fatalf("got len(s.GetRouteTable()) = %d, want = 0", got) - } + expectAsyncRouterInvalidationEvent(llAddr3, l3Lifetime*time.Second+defaultTimeout) } // TestRouterDiscoveryMaxRouters tests that only // stack.MaxDiscoveredDefaultRouters discovered routers are remembered. func TestRouterDiscoveryMaxRouters(t *testing.T) { + t.Parallel() + ndpDisp := ndpDispatcher{ - routerC: make(chan ndpRouterEvent, 10), + routerC: make(chan ndpRouterEvent, 1), rememberRouter: true, } - e := channel.New(10, 1280, linkAddr1) + e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, NDPConfigs: stack.NDPConfigurations{ @@ -1058,8 +1092,6 @@ func TestRouterDiscoveryMaxRouters(t *testing.T) { t.Fatalf("CreateNIC(1) = %s", err) } - expectedRt := [stack.MaxDiscoveredDefaultRouters]tcpip.Route{} - // Receive an RA from 2 more than the max number of discovered routers. for i := 1; i <= stack.MaxDiscoveredDefaultRouters+2; i++ { linkAddr := []byte{2, 2, 3, 4, 5, 0} @@ -1069,36 +1101,23 @@ func TestRouterDiscoveryMaxRouters(t *testing.T) { e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr, 5)) if i <= stack.MaxDiscoveredDefaultRouters { - expectedRt[i-1] = tcpip.Route{header.IPv6EmptySubnet, llAddr, 1} select { - case r := <-ndpDisp.routerC: - if r.nicID != 1 { - t.Fatalf("got r.nicID = %d, want = 1", r.nicID) + case e := <-ndpDisp.routerC: + if diff := checkRouterEvent(e, llAddr, true); diff != "" { + t.Errorf("router event mismatch (-want +got):\n%s", diff) } - if r.addr != llAddr { - t.Fatalf("got r.addr = %s, want = %s", r.addr, llAddr) - } - if !r.discovered { - t.Fatal("got r.discovered = false, want = true") - } - case <-time.After(defaultTimeout): - t.Fatal("timeout waiting for router discovery event") + default: + t.Fatal("expected router discovery event") } } else { select { case <-ndpDisp.routerC: t.Fatal("should not have discovered a new router after we already discovered the max number of routers") - case <-time.After(defaultTimeout): + default: } } } - - // Should only have default routes for the first - // stack.MaxDiscoveredDefaultRouters discovered routers. - if got := s.GetRouteTable(); !cmp.Equal(got, expectedRt[:]) { - t.Fatalf("got GetRouteTable = %v, want = %v", got, expectedRt) - } } // TestNoPrefixDiscovery tests that prefix discovery will not be performed if @@ -1121,10 +1140,12 @@ func TestNoPrefixDiscovery(t *testing.T) { forwarding := i&4 == 0 t.Run(fmt.Sprintf("HandleRAs(%t), DiscoverOnLinkPrefixes(%t), Forwarding(%t)", handle, discover, forwarding), func(t *testing.T) { + t.Parallel() + ndpDisp := ndpDispatcher{ - prefixC: make(chan ndpPrefixEvent, 10), + prefixC: make(chan ndpPrefixEvent, 1), } - e := channel.New(10, 1280, linkAddr1) + e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, NDPConfigs: stack.NDPConfigurations{ @@ -1140,30 +1161,34 @@ func TestNoPrefixDiscovery(t *testing.T) { } // Rx an RA with prefix with non-zero lifetime. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, 10)) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, 10, 0)) select { case <-ndpDisp.prefixC: t.Fatal("unexpectedly discovered a prefix when configured not to") - case <-time.After(defaultTimeout): + default: } }) } } +// Check e to make sure that the event is for prefix on nic with ID 1, and the +// discovered flag set to discovered. +func checkPrefixEvent(e ndpPrefixEvent, prefix tcpip.Subnet, discovered bool) string { + return cmp.Diff(ndpPrefixEvent{nicID: 1, prefix: prefix, discovered: discovered}, e, cmp.AllowUnexported(e)) +} + // TestPrefixDiscoveryDispatcherNoRemember tests that the stack does not // remember a discovered on-link prefix when the dispatcher asks it not to. func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) { - prefix := tcpip.AddressWithPrefix{ - Address: tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x00"), - PrefixLen: 64, - } - subnet := prefix.Subnet() + t.Parallel() + + prefix, subnet, _ := prefixSubnetAddr(0, "") ndpDisp := ndpDispatcher{ - prefixC: make(chan ndpPrefixEvent, 10), + prefixC: make(chan ndpPrefixEvent, 1), } - e := channel.New(10, 1280, linkAddr1) + e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, NDPConfigs: stack.NDPConfigurations{ @@ -1178,75 +1203,40 @@ func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) { t.Fatalf("CreateNIC(1) = %s", err) } - routeTable := []tcpip.Route{ - { - header.IPv6EmptySubnet, - llAddr3, - 1, - }, - } - s.SetRouteTable(routeTable) - - // Rx an RA with prefix with a short lifetime. - const lifetime = 1 - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, lifetime)) + // Receive an RA with prefix that we should not remember. + const lifetimeSeconds = 1 + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, lifetimeSeconds, 0)) select { - case r := <-ndpDisp.prefixC: - if r.nicID != 1 { - t.Fatalf("got r.nicID = %d, want = 1", r.nicID) - } - if r.prefix != subnet { - t.Fatalf("got r.prefix = %s, want = %s", r.prefix, subnet) + case e := <-ndpDisp.prefixC: + if diff := checkPrefixEvent(e, subnet, true); diff != "" { + t.Errorf("prefix event mismatch (-want +got):\n%s", diff) } - if !r.discovered { - t.Fatal("got r.discovered = false, want = true") - } - case <-time.After(defaultTimeout): - t.Fatal("timeout waiting for prefix discovery event") - } - - // Original route table should not have been modified. - if got := s.GetRouteTable(); !cmp.Equal(got, routeTable) { - t.Fatalf("got GetRouteTable = %v, want = %v", got, routeTable) + default: + t.Fatal("expected prefix discovery event") } - // Wait for the normal invalidation time plus some buffer to - // make sure we do not actually receive any invalidation events as - // we should not have remembered the prefix in the first place. + // Wait for the invalidation time plus some buffer to make sure we do + // not actually receive any invalidation events as we should not have + // remembered the prefix in the first place. select { case <-ndpDisp.prefixC: t.Fatal("should not have received any prefix events") - case <-time.After(lifetime*time.Second + defaultTimeout): - } - - // Original route table should not have been modified. - if got := s.GetRouteTable(); !cmp.Equal(got, routeTable) { - t.Fatalf("got GetRouteTable = %v, want = %v", got, routeTable) + case <-time.After(lifetimeSeconds*time.Second + defaultTimeout): } } func TestPrefixDiscovery(t *testing.T) { - prefix1 := tcpip.AddressWithPrefix{ - Address: tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x00"), - PrefixLen: 64, - } - prefix2 := tcpip.AddressWithPrefix{ - Address: tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x09\x00\x00\x00\x00\x00\x00\x00\x00"), - PrefixLen: 64, - } - prefix3 := tcpip.AddressWithPrefix{ - Address: tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x09\x0a\x00\x00\x00\x00\x00\x00\x00"), - PrefixLen: 72, - } - subnet1 := prefix1.Subnet() - subnet2 := prefix2.Subnet() - subnet3 := prefix3.Subnet() + t.Parallel() + + prefix1, subnet1, _ := prefixSubnetAddr(0, "") + prefix2, subnet2, _ := prefixSubnetAddr(1, "") + prefix3, subnet3, _ := prefixSubnetAddr(2, "") ndpDisp := ndpDispatcher{ - prefixC: make(chan ndpPrefixEvent, 10), + prefixC: make(chan ndpPrefixEvent, 1), rememberPrefix: true, } - e := channel.New(10, 1280, linkAddr1) + e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, NDPConfigs: stack.NDPConfigurations{ @@ -1256,106 +1246,72 @@ func TestPrefixDiscovery(t *testing.T) { NDPDisp: &ndpDisp, }) - waitForEvent := func(subnet tcpip.Subnet, discovered bool, timeout time.Duration) { + if err := s.CreateNIC(1, e); err != nil { + t.Fatalf("CreateNIC(1) = %s", err) + } + + expectPrefixEvent := func(prefix tcpip.Subnet, discovered bool) { t.Helper() select { - case r := <-ndpDisp.prefixC: - if r.nicID != 1 { - t.Fatalf("got r.nicID = %d, want = 1", r.nicID) - } - if r.prefix != subnet { - t.Fatalf("got r.prefix = %s, want = %s", r.prefix, subnet) + case e := <-ndpDisp.prefixC: + if diff := checkPrefixEvent(e, prefix, discovered); diff != "" { + t.Errorf("prefix event mismatch (-want +got):\n%s", diff) } - if r.discovered != discovered { - t.Fatalf("got r.discovered = %t, want = %t", r.discovered, discovered) - } - case <-time.After(timeout): - t.Fatal("timeout waiting for prefix discovery event") + default: + t.Fatal("expected prefix discovery event") } } - if err := s.CreateNIC(1, e); err != nil { - t.Fatalf("CreateNIC(1) = %s", err) - } - // Receive an RA with prefix1 in an NDP Prefix Information option (PI) // with zero valid lifetime. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, 0)) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, false, 0, 0)) select { case <-ndpDisp.prefixC: t.Fatal("unexpectedly discovered a prefix with 0 lifetime") - case <-time.After(defaultTimeout): + default: } // Receive an RA with prefix1 in an NDP Prefix Information option (PI) // with non-zero lifetime. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, 100)) - waitForEvent(subnet1, true, defaultTimeout) - - // Should have added a device route for subnet1 through the nic. - if got, want := s.GetRouteTable(), []tcpip.Route{{subnet1, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) { - t.Fatalf("got GetRouteTable = %v, want = %v", got, want) - } + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, false, 100, 0)) + expectPrefixEvent(subnet1, true) // Receive an RA with prefix2 in a PI. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, 100)) - waitForEvent(subnet2, true, defaultTimeout) - - // Should have added a device route for subnet2 through the nic. - if got, want := s.GetRouteTable(), []tcpip.Route{{subnet1, tcpip.Address([]byte(nil)), 1}, {subnet2, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) { - t.Fatalf("got GetRouteTable = %v, want = %v", got, want) - } + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, false, 100, 0)) + expectPrefixEvent(subnet2, true) // Receive an RA with prefix3 in a PI. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, 100)) - waitForEvent(subnet3, true, defaultTimeout) - - // Should have added a device route for subnet3 through the nic. - if got, want := s.GetRouteTable(), []tcpip.Route{{subnet1, tcpip.Address([]byte(nil)), 1}, {subnet2, tcpip.Address([]byte(nil)), 1}, {subnet3, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) { - t.Fatalf("got GetRouteTable = %v, want = %v", got, want) - } + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, false, 100, 0)) + expectPrefixEvent(subnet3, true) // Receive an RA with prefix1 in a PI with lifetime = 0. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, 0)) - waitForEvent(subnet1, false, defaultTimeout) - - // Should have removed the device route for subnet1 through the nic. - if got, want := s.GetRouteTable(), []tcpip.Route{{subnet2, tcpip.Address([]byte(nil)), 1}, {subnet3, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) { - t.Fatalf("got GetRouteTable = %v, want = %v", got, want) - } + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, false, 0, 0)) + expectPrefixEvent(subnet1, false) // Receive an RA with prefix2 in a PI with lesser lifetime. lifetime := uint32(2) - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, lifetime)) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, false, lifetime, 0)) select { case <-ndpDisp.prefixC: t.Fatal("unexpectedly received prefix event when updating lifetime") - case <-time.After(defaultTimeout): - } - - // Should not have updated route table. - if got, want := s.GetRouteTable(), []tcpip.Route{{subnet2, tcpip.Address([]byte(nil)), 1}, {subnet3, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) { - t.Fatalf("got GetRouteTable = %v, want = %v", got, want) + default: } // Wait for prefix2's most recent invalidation timer plus some buffer to // expire. - waitForEvent(subnet2, false, time.Duration(lifetime)*time.Second+defaultTimeout) - - // Should have removed the device route for subnet2 through the nic. - if got, want := s.GetRouteTable(), []tcpip.Route{{subnet3, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) { - t.Fatalf("got GetRouteTable = %v, want = %v", got, want) + select { + case e := <-ndpDisp.prefixC: + if diff := checkPrefixEvent(e, subnet2, false); diff != "" { + t.Errorf("prefix event mismatch (-want +got):\n%s", diff) + } + case <-time.After(time.Duration(lifetime)*time.Second + defaultTimeout): + t.Fatal("timed out waiting for prefix discovery event") } // Receive RA to invalidate prefix3. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, 0)) - waitForEvent(subnet3, false, defaultTimeout) - - // Should not have any routes. - if got := len(s.GetRouteTable()); got != 0 { - t.Fatalf("got len(s.GetRouteTable()) = %d, want = 0", got) - } + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, false, 0, 0)) + expectPrefixEvent(subnet3, false) } func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) { @@ -1364,10 +1320,10 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) { // invalidate the prefix. const testInfiniteLifetimeSeconds = 2 const testInfiniteLifetime = testInfiniteLifetimeSeconds * time.Second - saved := header.NDPPrefixInformationInfiniteLifetime - header.NDPPrefixInformationInfiniteLifetime = testInfiniteLifetime + saved := header.NDPInfiniteLifetime + header.NDPInfiniteLifetime = testInfiniteLifetime defer func() { - header.NDPPrefixInformationInfiniteLifetime = saved + header.NDPInfiniteLifetime = saved }() prefix := tcpip.AddressWithPrefix{ @@ -1377,10 +1333,10 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) { subnet := prefix.Subnet() ndpDisp := ndpDispatcher{ - prefixC: make(chan ndpPrefixEvent, 10), + prefixC: make(chan ndpPrefixEvent, 1), rememberPrefix: true, } - e := channel.New(10, 1280, linkAddr1) + e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, NDPConfigs: stack.NDPConfigurations{ @@ -1390,33 +1346,27 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) { NDPDisp: &ndpDisp, }) - waitForEvent := func(discovered bool, timeout time.Duration) { + if err := s.CreateNIC(1, e); err != nil { + t.Fatalf("CreateNIC(1) = %s", err) + } + + expectPrefixEvent := func(prefix tcpip.Subnet, discovered bool) { t.Helper() select { - case r := <-ndpDisp.prefixC: - if r.nicID != 1 { - t.Errorf("got r.nicID = %d, want = 1", r.nicID) - } - if r.prefix != subnet { - t.Errorf("got r.prefix = %s, want = %s", r.prefix, subnet) + case e := <-ndpDisp.prefixC: + if diff := checkPrefixEvent(e, prefix, discovered); diff != "" { + t.Errorf("prefix event mismatch (-want +got):\n%s", diff) } - if r.discovered != discovered { - t.Errorf("got r.discovered = %t, want = %t", r.discovered, discovered) - } - case <-time.After(timeout): - t.Fatal("timeout waiting for prefix discovery event") + default: + t.Fatal("expected prefix discovery event") } } - if err := s.CreateNIC(1, e); err != nil { - t.Fatalf("CreateNIC(1) = %s", err) - } - // Receive an RA with prefix in an NDP Prefix Information option (PI) // with infinite valid lifetime which should not get invalidated. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds)) - waitForEvent(true, defaultTimeout) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds, 0)) + expectPrefixEvent(subnet, true) select { case <-ndpDisp.prefixC: t.Fatal("unexpectedly invalidated a prefix with infinite lifetime") @@ -1425,16 +1375,23 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) { // Receive an RA with finite lifetime. // The prefix should get invalidated after 1s. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds-1)) - waitForEvent(false, testInfiniteLifetime) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds-1, 0)) + select { + case e := <-ndpDisp.prefixC: + if diff := checkPrefixEvent(e, subnet, false); diff != "" { + t.Errorf("prefix event mismatch (-want +got):\n%s", diff) + } + case <-time.After(testInfiniteLifetime): + t.Fatal("timed out waiting for prefix discovery event") + } // Receive an RA with finite lifetime. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds-1)) - waitForEvent(true, defaultTimeout) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds-1, 0)) + expectPrefixEvent(subnet, true) // Receive an RA with prefix with an infinite lifetime. // The prefix should not be invalidated. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds)) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds, 0)) select { case <-ndpDisp.prefixC: t.Fatal("unexpectedly invalidated a prefix with infinite lifetime") @@ -1443,7 +1400,7 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) { // Receive an RA with a prefix with a lifetime value greater than the // set infinite lifetime value. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds+1)) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds+1, 0)) select { case <-ndpDisp.prefixC: t.Fatal("unexpectedly invalidated a prefix with infinite lifetime") @@ -1452,18 +1409,20 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) { // Receive an RA with 0 lifetime. // The prefix should get invalidated. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, 0)) - waitForEvent(false, defaultTimeout) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, 0, 0)) + expectPrefixEvent(subnet, false) } // TestPrefixDiscoveryMaxRouters tests that only // stack.MaxDiscoveredOnLinkPrefixes discovered on-link prefixes are remembered. func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) { + t.Parallel() + ndpDisp := ndpDispatcher{ prefixC: make(chan ndpPrefixEvent, stack.MaxDiscoveredOnLinkPrefixes+3), rememberPrefix: true, } - e := channel.New(10, 1280, linkAddr1) + e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, NDPConfigs: stack.NDPConfigurations{ @@ -1479,7 +1438,6 @@ func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) { } optSer := make(header.NDPOptionsSerializer, stack.MaxDiscoveredOnLinkPrefixes+2) - expectedRt := [stack.MaxDiscoveredOnLinkPrefixes]tcpip.Route{} prefixes := [stack.MaxDiscoveredOnLinkPrefixes + 2]tcpip.Subnet{} // Receive an RA with 2 more than the max number of discovered on-link @@ -1499,41 +1457,1500 @@ func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) { copy(buf[14:], prefix.Address) optSer[i] = header.NDPPrefixInformation(buf[:]) - - if i < stack.MaxDiscoveredOnLinkPrefixes { - expectedRt[i] = tcpip.Route{prefixes[i], tcpip.Address([]byte(nil)), 1} - } } e.InjectInbound(header.IPv6ProtocolNumber, raBufWithOpts(llAddr1, 0, optSer)) for i := 0; i < stack.MaxDiscoveredOnLinkPrefixes+2; i++ { if i < stack.MaxDiscoveredOnLinkPrefixes { select { - case r := <-ndpDisp.prefixC: - if r.nicID != 1 { - t.Fatalf("got r.nicID = %d, want = 1", r.nicID) - } - if r.prefix != prefixes[i] { - t.Fatalf("got r.prefix = %s, want = %s", r.prefix, prefixes[i]) + case e := <-ndpDisp.prefixC: + if diff := checkPrefixEvent(e, prefixes[i], true); diff != "" { + t.Errorf("prefix event mismatch (-want +got):\n%s", diff) } - if !r.discovered { - t.Fatal("got r.discovered = false, want = true") - } - case <-time.After(defaultTimeout): - t.Fatal("timeout waiting for prefix discovery event") + default: + t.Fatal("expected prefix discovery event") } } else { select { case <-ndpDisp.prefixC: t.Fatal("should not have discovered a new prefix after we already discovered the max number of prefixes") + default: + } + } + } +} + +// Checks to see if list contains an IPv6 address, item. +func contains(list []tcpip.ProtocolAddress, item tcpip.AddressWithPrefix) bool { + protocolAddress := tcpip.ProtocolAddress{ + Protocol: header.IPv6ProtocolNumber, + AddressWithPrefix: item, + } + + for _, i := range list { + if i == protocolAddress { + return true + } + } + + return false +} + +// TestNoAutoGenAddr tests that SLAAC is not performed when configured not to. +func TestNoAutoGenAddr(t *testing.T) { + prefix, _, _ := prefixSubnetAddr(0, "") + + // Being configured to auto-generate addresses means handle and + // autogen are set to true and forwarding is set to false. + // This tests all possible combinations of the configurations, + // except for the configuration where handle = true, autogen = + // true and forwarding = false (the required configuration to do + // SLAAC) - that will done in other tests. + for i := 0; i < 7; i++ { + handle := i&1 != 0 + autogen := i&2 != 0 + forwarding := i&4 == 0 + + t.Run(fmt.Sprintf("HandleRAs(%t), AutoGenAddr(%t), Forwarding(%t)", handle, autogen, forwarding), func(t *testing.T) { + t.Parallel() + + ndpDisp := ndpDispatcher{ + autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1), + } + e := channel.New(0, 1280, linkAddr1) + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NDPConfigs: stack.NDPConfigurations{ + HandleRAs: handle, + AutoGenGlobalAddresses: autogen, + }, + NDPDisp: &ndpDisp, + }) + s.SetForwarding(forwarding) + + if err := s.CreateNIC(1, e); err != nil { + t.Fatalf("CreateNIC(1) = %s", err) + } + + // Rx an RA with prefix with non-zero lifetime. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, false, true, 10, 0)) + + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly auto-generated an address when configured not to") + default: + } + }) + } +} + +// Check e to make sure that the event is for addr on nic with ID 1, and the +// event type is set to eventType. +func checkAutoGenAddrEvent(e ndpAutoGenAddrEvent, addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) string { + return cmp.Diff(ndpAutoGenAddrEvent{nicID: 1, addr: addr, eventType: eventType}, e, cmp.AllowUnexported(e)) +} + +// TestAutoGenAddr tests that an address is properly generated and invalidated +// when configured to do so. +func TestAutoGenAddr(t *testing.T) { + const newMinVL = 2 + newMinVLDuration := newMinVL * time.Second + saved := stack.MinPrefixInformationValidLifetimeForUpdate + defer func() { + stack.MinPrefixInformationValidLifetimeForUpdate = saved + }() + stack.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration + + prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1) + prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1) + + ndpDisp := ndpDispatcher{ + autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1), + } + e := channel.New(0, 1280, linkAddr1) + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NDPConfigs: stack.NDPConfigurations{ + HandleRAs: true, + AutoGenGlobalAddresses: true, + }, + NDPDisp: &ndpDisp, + }) + + if err := s.CreateNIC(1, e); err != nil { + t.Fatalf("CreateNIC(1) = %s", err) + } + + expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) { + t.Helper() + + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + } + default: + t.Fatal("expected addr auto gen event") + } + } + + // Receive an RA with prefix1 in an NDP Prefix Information option (PI) + // with zero valid lifetime. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 0, 0)) + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly auto-generated an address with 0 lifetime") + default: + } + + // Receive an RA with prefix1 in an NDP Prefix Information option (PI) + // with non-zero lifetime. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0)) + expectAutoGenAddrEvent(addr1, newAddr) + if !contains(s.NICInfo()[1].ProtocolAddresses, addr1) { + t.Fatalf("Should have %s in the list of addresses", addr1) + } + + // Receive an RA with prefix2 in an NDP Prefix Information option (PI) + // with preferred lifetime > valid lifetime + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 5, 6)) + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly auto-generated an address with preferred lifetime > valid lifetime") + default: + } + + // Receive an RA with prefix2 in a PI. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0)) + expectAutoGenAddrEvent(addr2, newAddr) + if !contains(s.NICInfo()[1].ProtocolAddresses, addr1) { + t.Fatalf("Should have %s in the list of addresses", addr1) + } + if !contains(s.NICInfo()[1].ProtocolAddresses, addr2) { + t.Fatalf("Should have %s in the list of addresses", addr2) + } + + // Refresh valid lifetime for addr of prefix1. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, 0)) + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly auto-generated an address when we already have an address for a prefix") + default: + } + + // Wait for addr of prefix1 to be invalidated. + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, addr1, invalidatedAddr); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + } + case <-time.After(newMinVLDuration + defaultTimeout): + t.Fatal("timed out waiting for addr auto gen event") + } + if contains(s.NICInfo()[1].ProtocolAddresses, addr1) { + t.Fatalf("Should not have %s in the list of addresses", addr1) + } + if !contains(s.NICInfo()[1].ProtocolAddresses, addr2) { + t.Fatalf("Should have %s in the list of addresses", addr2) + } +} + +// stackAndNdpDispatcherWithDefaultRoute returns an ndpDispatcher, +// channel.Endpoint and stack.Stack. +// +// stack.Stack will have a default route through the router (llAddr3) installed +// and a static link-address (linkAddr3) added to the link address cache for the +// router. +func stackAndNdpDispatcherWithDefaultRoute(t *testing.T, nicID tcpip.NICID) (*ndpDispatcher, *channel.Endpoint, *stack.Stack) { + t.Helper() + ndpDisp := &ndpDispatcher{ + autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1), + } + e := channel.New(0, 1280, linkAddr1) + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}, + NDPConfigs: stack.NDPConfigurations{ + HandleRAs: true, + AutoGenGlobalAddresses: true, + }, + NDPDisp: ndpDisp, + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + s.SetRouteTable([]tcpip.Route{{ + Destination: header.IPv6EmptySubnet, + Gateway: llAddr3, + NIC: nicID, + }}) + s.AddLinkAddress(nicID, llAddr3, linkAddr3) + return ndpDisp, e, s +} + +// addrForNewConnection returns the local address used when creating a new +// connection. +func addrForNewConnection(t *testing.T, s *stack.Stack) tcpip.Address { + wq := waiter.Queue{} + we, ch := waiter.NewChannelEntry(nil) + wq.EventRegister(&we, waiter.EventIn) + defer wq.EventUnregister(&we) + defer close(ch) + ep, err := s.NewEndpoint(header.UDPProtocolNumber, header.IPv6ProtocolNumber, &wq) + if err != nil { + t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err) + } + defer ep.Close() + v := tcpip.V6OnlyOption(1) + if err := ep.SetSockOpt(v); err != nil { + t.Fatalf("SetSockOpt(%+v): %s", v, err) + } + if err := ep.Connect(dstAddr); err != nil { + t.Fatalf("ep.Connect(%+v): %s", dstAddr, err) + } + got, err := ep.GetLocalAddress() + if err != nil { + t.Fatalf("ep.GetLocalAddress(): %s", err) + } + return got.Addr +} + +// addrForNewConnectionWithAddr returns the local address used when creating a +// new connection with a specific local address. +func addrForNewConnectionWithAddr(t *testing.T, s *stack.Stack, addr tcpip.FullAddress) tcpip.Address { + wq := waiter.Queue{} + we, ch := waiter.NewChannelEntry(nil) + wq.EventRegister(&we, waiter.EventIn) + defer wq.EventUnregister(&we) + defer close(ch) + ep, err := s.NewEndpoint(header.UDPProtocolNumber, header.IPv6ProtocolNumber, &wq) + if err != nil { + t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err) + } + defer ep.Close() + v := tcpip.V6OnlyOption(1) + if err := ep.SetSockOpt(v); err != nil { + t.Fatalf("SetSockOpt(%+v): %s", v, err) + } + if err := ep.Bind(addr); err != nil { + t.Fatalf("ep.Bind(%+v): %s", addr, err) + } + if err := ep.Connect(dstAddr); err != nil { + t.Fatalf("ep.Connect(%+v): %s", dstAddr, err) + } + got, err := ep.GetLocalAddress() + if err != nil { + t.Fatalf("ep.GetLocalAddress(): %s", err) + } + return got.Addr +} + +// TestAutoGenAddrDeprecateFromPI tests deprecating a SLAAC address when +// receiving a PI with 0 preferred lifetime. +func TestAutoGenAddrDeprecateFromPI(t *testing.T) { + const nicID = 1 + + prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1) + prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1) + + ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID) + + expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) { + t.Helper() + + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + } + default: + t.Fatal("expected addr auto gen event") + } + } + + expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) { + t.Helper() + + if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil { + t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err) + } else if got != addr { + t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr) + } + + if got := addrForNewConnection(t, s); got != addr.Address { + t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address) + } + } + + // Receive PI for prefix1. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100)) + expectAutoGenAddrEvent(addr1, newAddr) + if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + t.Fatalf("should have %s in the list of addresses", addr1) + } + expectPrimaryAddr(addr1) + + // Deprecate addr for prefix1 immedaitely. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0)) + expectAutoGenAddrEvent(addr1, deprecatedAddr) + if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + t.Fatalf("should have %s in the list of addresses", addr1) + } + // addr should still be the primary endpoint as there are no other addresses. + expectPrimaryAddr(addr1) + + // Refresh lifetimes of addr generated from prefix1. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100)) + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly got an auto-generated event") + default: + } + expectPrimaryAddr(addr1) + + // Receive PI for prefix2. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100)) + expectAutoGenAddrEvent(addr2, newAddr) + if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + t.Fatalf("should have %s in the list of addresses", addr2) + } + expectPrimaryAddr(addr2) + + // Deprecate addr for prefix2 immedaitely. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0)) + expectAutoGenAddrEvent(addr2, deprecatedAddr) + if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + t.Fatalf("should have %s in the list of addresses", addr2) + } + // addr1 should be the primary endpoint now since addr2 is deprecated but + // addr1 is not. + expectPrimaryAddr(addr1) + // addr2 is deprecated but if explicitly requested, it should be used. + fullAddr2 := tcpip.FullAddress{Addr: addr2.Address, NIC: nicID} + if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address { + t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr2.Address) + } + + // Another PI w/ 0 preferred lifetime should not result in a deprecation + // event. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0)) + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly got an auto-generated event") + default: + } + expectPrimaryAddr(addr1) + if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address { + t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr2.Address) + } + + // Refresh lifetimes of addr generated from prefix2. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100)) + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly got an auto-generated event") + default: + } + expectPrimaryAddr(addr2) +} + +// TestAutoGenAddrTimerDeprecation tests that an address is properly deprecated +// when its preferred lifetime expires. +func TestAutoGenAddrTimerDeprecation(t *testing.T) { + const nicID = 1 + const newMinVL = 2 + newMinVLDuration := newMinVL * time.Second + saved := stack.MinPrefixInformationValidLifetimeForUpdate + defer func() { + stack.MinPrefixInformationValidLifetimeForUpdate = saved + }() + stack.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration + + prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1) + prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1) + + ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID) + + expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) { + t.Helper() + + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + } + default: + t.Fatal("expected addr auto gen event") + } + } + + expectAutoGenAddrEventAfter := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType, timeout time.Duration) { + t.Helper() + + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + } + case <-time.After(timeout): + t.Fatal("timed out waiting for addr auto gen event") + } + } + + expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) { + t.Helper() + + if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil { + t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err) + } else if got != addr { + t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr) + } + + if got := addrForNewConnection(t, s); got != addr.Address { + t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address) + } + } + + // Receive PI for prefix2. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100)) + expectAutoGenAddrEvent(addr2, newAddr) + if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + t.Fatalf("should have %s in the list of addresses", addr2) + } + expectPrimaryAddr(addr2) + + // Receive a PI for prefix1. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 90)) + expectAutoGenAddrEvent(addr1, newAddr) + if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + t.Fatalf("should have %s in the list of addresses", addr1) + } + if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + t.Fatalf("should have %s in the list of addresses", addr2) + } + expectPrimaryAddr(addr1) + + // Refresh lifetime for addr of prefix1. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, newMinVL-1)) + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly got an auto-generated event") + default: + } + expectPrimaryAddr(addr1) + + // Wait for addr of prefix1 to be deprecated. + expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultTimeout) + if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + t.Fatalf("should not have %s in the list of addresses", addr1) + } + if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + t.Fatalf("should have %s in the list of addresses", addr2) + } + // addr2 should be the primary endpoint now since addr1 is deprecated but + // addr2 is not. + expectPrimaryAddr(addr2) + // addr1 is deprecated but if explicitly requested, it should be used. + fullAddr1 := tcpip.FullAddress{Addr: addr1.Address, NIC: nicID} + if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address { + t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr1.Address) + } + + // Refresh valid lifetime for addr of prefix1, w/ 0 preferred lifetime to make + // sure we do not get a deprecation event again. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, 0)) + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly got an auto-generated event") + default: + } + expectPrimaryAddr(addr2) + if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address { + t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr1.Address) + } + + // Refresh lifetimes for addr of prefix1. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, newMinVL-1)) + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly got an auto-generated event") + default: + } + // addr1 is the primary endpoint again since it is non-deprecated now. + expectPrimaryAddr(addr1) + + // Wait for addr of prefix1 to be deprecated. + expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultTimeout) + if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + t.Fatalf("should not have %s in the list of addresses", addr1) + } + if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + t.Fatalf("should have %s in the list of addresses", addr2) + } + // addr2 should be the primary endpoint now since it is not deprecated. + expectPrimaryAddr(addr2) + if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address { + t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr1.Address) + } + + // Wait for addr of prefix1 to be invalidated. + expectAutoGenAddrEventAfter(addr1, invalidatedAddr, time.Second+defaultTimeout) + if contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + t.Fatalf("should not have %s in the list of addresses", addr1) + } + if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + t.Fatalf("should have %s in the list of addresses", addr2) + } + expectPrimaryAddr(addr2) + + // Refresh both lifetimes for addr of prefix2 to the same value. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, newMinVL, newMinVL)) + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly got an auto-generated event") + default: + } + + // Wait for a deprecation then invalidation events, or just an invalidation + // event. We need to cover both cases but cannot deterministically hit both + // cases because the deprecation and invalidation handlers could be handled in + // either deprecation then invalidation, or invalidation then deprecation + // (which should be cancelled by the invalidation handler). + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, addr2, deprecatedAddr); diff == "" { + // If we get a deprecation event first, we should get an invalidation + // event almost immediately after. + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + } + case <-time.After(defaultTimeout): + t.Fatal("timed out waiting for addr auto gen event") + } + } else if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff == "" { + // If we get an invalidation event first, we should not get a deprecation + // event after. + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly got an auto-generated event") case <-time.After(defaultTimeout): } + } else { + t.Fatalf("got unexpected auto-generated event") } + + case <-time.After(newMinVLDuration + defaultTimeout): + t.Fatal("timed out waiting for addr auto gen event") + } + if contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + t.Fatalf("should not have %s in the list of addresses", addr1) } + if contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + t.Fatalf("should not have %s in the list of addresses", addr2) + } + // Should not have any primary endpoints. + if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil { + t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err) + } else if want := (tcpip.AddressWithPrefix{}); got != want { + t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, want) + } + wq := waiter.Queue{} + we, ch := waiter.NewChannelEntry(nil) + wq.EventRegister(&we, waiter.EventIn) + defer wq.EventUnregister(&we) + defer close(ch) + ep, err := s.NewEndpoint(header.UDPProtocolNumber, header.IPv6ProtocolNumber, &wq) + if err != nil { + t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err) + } + defer ep.Close() + v := tcpip.V6OnlyOption(1) + if err := ep.SetSockOpt(v); err != nil { + t.Fatalf("SetSockOpt(%+v): %s", v, err) + } + + if err := ep.Connect(dstAddr); err != tcpip.ErrNoRoute { + t.Errorf("got ep.Connect(%+v) = %v, want = %s", dstAddr, err, tcpip.ErrNoRoute) + } +} - // Should only have device routes for the first - // stack.MaxDiscoveredOnLinkPrefixes discovered on-link prefixes. - if got := s.GetRouteTable(); !cmp.Equal(got, expectedRt[:]) { - t.Fatalf("got GetRouteTable = %v, want = %v", got, expectedRt) +// Tests transitioning a SLAAC address's valid lifetime between finite and +// infinite values. +func TestAutoGenAddrFiniteToInfiniteToFiniteVL(t *testing.T) { + const infiniteVLSeconds = 2 + const minVLSeconds = 1 + savedIL := header.NDPInfiniteLifetime + savedMinVL := stack.MinPrefixInformationValidLifetimeForUpdate + defer func() { + stack.MinPrefixInformationValidLifetimeForUpdate = savedMinVL + header.NDPInfiniteLifetime = savedIL + }() + stack.MinPrefixInformationValidLifetimeForUpdate = minVLSeconds * time.Second + header.NDPInfiniteLifetime = infiniteVLSeconds * time.Second + + prefix, _, addr := prefixSubnetAddr(0, linkAddr1) + + tests := []struct { + name string + infiniteVL uint32 + }{ + { + name: "EqualToInfiniteVL", + infiniteVL: infiniteVLSeconds, + }, + // Our implementation supports changing header.NDPInfiniteLifetime for tests + // such that a packet can be received where the lifetime field has a value + // greater than header.NDPInfiniteLifetime. Because of this, we test to make + // sure that receiving a value greater than header.NDPInfiniteLifetime is + // handled the same as when receiving a value equal to + // header.NDPInfiniteLifetime. + { + name: "MoreThanInfiniteVL", + infiniteVL: infiniteVLSeconds + 1, + }, + } + + // This Run will not return until the parallel tests finish. + // + // We need this because we need to do some teardown work after the + // parallel tests complete. + // + // See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for + // more details. + t.Run("group", func(t *testing.T) { + for _, test := range tests { + test := test + + t.Run(test.name, func(t *testing.T) { + t.Parallel() + + ndpDisp := ndpDispatcher{ + autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1), + } + e := channel.New(0, 1280, linkAddr1) + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NDPConfigs: stack.NDPConfigurations{ + HandleRAs: true, + AutoGenGlobalAddresses: true, + }, + NDPDisp: &ndpDisp, + }) + + if err := s.CreateNIC(1, e); err != nil { + t.Fatalf("CreateNIC(1) = %s", err) + } + + // Receive an RA with finite prefix. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, minVLSeconds, 0)) + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, addr, newAddr); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + } + + default: + t.Fatal("expected addr auto gen event") + } + + // Receive an new RA with prefix with infinite VL. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, test.infiniteVL, 0)) + + // Receive a new RA with prefix with finite VL. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, minVLSeconds, 0)) + + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + } + + case <-time.After(minVLSeconds*time.Second + defaultTimeout): + t.Fatal("timeout waiting for addr auto gen event") + } + }) + } + }) +} + +// TestAutoGenAddrValidLifetimeUpdates tests that the valid lifetime of an +// auto-generated address only gets updated when required to, as specified in +// RFC 4862 section 5.5.3.e. +func TestAutoGenAddrValidLifetimeUpdates(t *testing.T) { + const infiniteVL = 4294967295 + const newMinVL = 4 + saved := stack.MinPrefixInformationValidLifetimeForUpdate + defer func() { + stack.MinPrefixInformationValidLifetimeForUpdate = saved + }() + stack.MinPrefixInformationValidLifetimeForUpdate = newMinVL * time.Second + + prefix, _, addr := prefixSubnetAddr(0, linkAddr1) + + tests := []struct { + name string + ovl uint32 + nvl uint32 + evl uint32 + }{ + // Should update the VL to the minimum VL for updating if the + // new VL is less than newMinVL but was originally greater than + // it. + { + "LargeVLToVLLessThanMinVLForUpdate", + 9999, + 1, + newMinVL, + }, + { + "LargeVLTo0", + 9999, + 0, + newMinVL, + }, + { + "InfiniteVLToVLLessThanMinVLForUpdate", + infiniteVL, + 1, + newMinVL, + }, + { + "InfiniteVLTo0", + infiniteVL, + 0, + newMinVL, + }, + + // Should not update VL if original VL was less than newMinVL + // and the new VL is also less than newMinVL. + { + "ShouldNotUpdateWhenBothOldAndNewAreLessThanMinVLForUpdate", + newMinVL - 1, + newMinVL - 3, + newMinVL - 1, + }, + + // Should take the new VL if the new VL is greater than the + // remaining time or is greater than newMinVL. + { + "MorethanMinVLToLesserButStillMoreThanMinVLForUpdate", + newMinVL + 5, + newMinVL + 3, + newMinVL + 3, + }, + { + "SmallVLToGreaterVLButStillLessThanMinVLForUpdate", + newMinVL - 3, + newMinVL - 1, + newMinVL - 1, + }, + { + "SmallVLToGreaterVLThatIsMoreThaMinVLForUpdate", + newMinVL - 3, + newMinVL + 1, + newMinVL + 1, + }, + } + + const delta = 500 * time.Millisecond + + // This Run will not return until the parallel tests finish. + // + // We need this because we need to do some teardown work after the + // parallel tests complete. + // + // See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for + // more details. + t.Run("group", func(t *testing.T) { + for _, test := range tests { + test := test + + t.Run(test.name, func(t *testing.T) { + t.Parallel() + + ndpDisp := ndpDispatcher{ + autoGenAddrC: make(chan ndpAutoGenAddrEvent, 10), + } + e := channel.New(10, 1280, linkAddr1) + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NDPConfigs: stack.NDPConfigurations{ + HandleRAs: true, + AutoGenGlobalAddresses: true, + }, + NDPDisp: &ndpDisp, + }) + + if err := s.CreateNIC(1, e); err != nil { + t.Fatalf("CreateNIC(1) = %s", err) + } + + // Receive an RA with prefix with initial VL, + // test.ovl. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, test.ovl, 0)) + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, addr, newAddr); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + } + default: + t.Fatal("expected addr auto gen event") + } + + // Receive an new RA with prefix with new VL, + // test.nvl. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, test.nvl, 0)) + + // + // Validate that the VL for the address got set + // to test.evl. + // + + // Make sure we do not get any invalidation + // events until atleast 500ms (delta) before + // test.evl. + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly received an auto gen addr event") + case <-time.After(time.Duration(test.evl)*time.Second - delta): + } + + // Wait for another second (2x delta), but now + // we expect the invalidation event. + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + } + + case <-time.After(2 * delta): + t.Fatal("timeout waiting for addr auto gen event") + } + }) + } + }) +} + +// TestAutoGenAddrRemoval tests that when auto-generated addresses are removed +// by the user, its resources will be cleaned up and an invalidation event will +// be sent to the integrator. +func TestAutoGenAddrRemoval(t *testing.T) { + t.Parallel() + + prefix, _, addr := prefixSubnetAddr(0, linkAddr1) + + ndpDisp := ndpDispatcher{ + autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1), + } + e := channel.New(0, 1280, linkAddr1) + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NDPConfigs: stack.NDPConfigurations{ + HandleRAs: true, + AutoGenGlobalAddresses: true, + }, + NDPDisp: &ndpDisp, + }) + + if err := s.CreateNIC(1, e); err != nil { + t.Fatalf("CreateNIC(1) = %s", err) + } + + expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) { + t.Helper() + + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + } + default: + t.Fatal("expected addr auto gen event") + } + } + + // Receive a PI to auto-generate an address. + const lifetimeSeconds = 1 + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetimeSeconds, 0)) + expectAutoGenAddrEvent(addr, newAddr) + + // Removing the address should result in an invalidation event + // immediately. + if err := s.RemoveAddress(1, addr.Address); err != nil { + t.Fatalf("RemoveAddress(_, %s) = %s", addr.Address, err) + } + expectAutoGenAddrEvent(addr, invalidatedAddr) + + // Wait for the original valid lifetime to make sure the original timer + // got stopped/cleaned up. + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly received an auto gen addr event") + case <-time.After(lifetimeSeconds*time.Second + defaultTimeout): + } +} + +// TestAutoGenAddrStaticConflict tests that if SLAAC generates an address that +// is already assigned to the NIC, the static address remains. +func TestAutoGenAddrStaticConflict(t *testing.T) { + t.Parallel() + + prefix, _, addr := prefixSubnetAddr(0, linkAddr1) + + ndpDisp := ndpDispatcher{ + autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1), + } + e := channel.New(0, 1280, linkAddr1) + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NDPConfigs: stack.NDPConfigurations{ + HandleRAs: true, + AutoGenGlobalAddresses: true, + }, + NDPDisp: &ndpDisp, + }) + + if err := s.CreateNIC(1, e); err != nil { + t.Fatalf("CreateNIC(1) = %s", err) + } + + // Add the address as a static address before SLAAC tries to add it. + if err := s.AddProtocolAddress(1, tcpip.ProtocolAddress{Protocol: header.IPv6ProtocolNumber, AddressWithPrefix: addr}); err != nil { + t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr.Address, err) + } + if !contains(s.NICInfo()[1].ProtocolAddresses, addr) { + t.Fatalf("Should have %s in the list of addresses", addr1) + } + + // Receive a PI where the generated address will be the same as the one + // that we already have assigned statically. + const lifetimeSeconds = 1 + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetimeSeconds, 0)) + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly received an auto gen addr event for an address we already have statically") + default: + } + if !contains(s.NICInfo()[1].ProtocolAddresses, addr) { + t.Fatalf("Should have %s in the list of addresses", addr1) + } + + // Should not get an invalidation event after the PI's invalidation + // time. + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly received an auto gen addr event") + case <-time.After(lifetimeSeconds*time.Second + defaultTimeout): + } + if !contains(s.NICInfo()[1].ProtocolAddresses, addr) { + t.Fatalf("Should have %s in the list of addresses", addr1) + } +} + +// TestAutoGenAddrWithOpaqueIID tests that SLAAC generated addresses will use +// opaque interface identifiers when configured to do so. +func TestAutoGenAddrWithOpaqueIID(t *testing.T) { + t.Parallel() + + const nicID = 1 + const nicName = "nic1" + var secretKeyBuf [header.OpaqueIIDSecretKeyMinBytes]byte + secretKey := secretKeyBuf[:] + n, err := rand.Read(secretKey) + if err != nil { + t.Fatalf("rand.Read(_): %s", err) + } + if n != header.OpaqueIIDSecretKeyMinBytes { + t.Fatalf("got rand.Read(_) = (%d, _), want = (%d, _)", n, header.OpaqueIIDSecretKeyMinBytes) + } + + prefix1, subnet1, _ := prefixSubnetAddr(0, linkAddr1) + prefix2, subnet2, _ := prefixSubnetAddr(1, linkAddr1) + // addr1 and addr2 are the addresses that are expected to be generated when + // stack.Stack is configured to generate opaque interface identifiers as + // defined by RFC 7217. + addrBytes := []byte(subnet1.ID()) + addr1 := tcpip.AddressWithPrefix{ + Address: tcpip.Address(header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], subnet1, nicName, 0, secretKey)), + PrefixLen: 64, + } + addrBytes = []byte(subnet2.ID()) + addr2 := tcpip.AddressWithPrefix{ + Address: tcpip.Address(header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], subnet2, nicName, 0, secretKey)), + PrefixLen: 64, + } + + ndpDisp := ndpDispatcher{ + autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1), + } + e := channel.New(0, 1280, linkAddr1) + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NDPConfigs: stack.NDPConfigurations{ + HandleRAs: true, + AutoGenGlobalAddresses: true, + }, + NDPDisp: &ndpDisp, + OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{ + NICNameFromID: func(_ tcpip.NICID, nicName string) string { + return nicName + }, + SecretKey: secretKey, + }, + }) + + if err := s.CreateNamedNIC(nicID, nicName, e); err != nil { + t.Fatalf("CreateNamedNIC(%d, %q, _) = %s", nicID, nicName, err) + } + + expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) { + t.Helper() + + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + } + default: + t.Fatal("expected addr auto gen event") + } + } + + // Receive an RA with prefix1 in a PI. + const validLifetimeSecondPrefix1 = 1 + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, validLifetimeSecondPrefix1, 0)) + expectAutoGenAddrEvent(addr1, newAddr) + if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + t.Fatalf("should have %s in the list of addresses", addr1) + } + + // Receive an RA with prefix2 in a PI with a large valid lifetime. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0)) + expectAutoGenAddrEvent(addr2, newAddr) + if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + t.Fatalf("should have %s in the list of addresses", addr1) + } + if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + t.Fatalf("should have %s in the list of addresses", addr2) + } + + // Wait for addr of prefix1 to be invalidated. + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, addr1, invalidatedAddr); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + } + case <-time.After(validLifetimeSecondPrefix1*time.Second + defaultTimeout): + t.Fatal("timed out waiting for addr auto gen event") + } + if contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + t.Fatalf("should not have %s in the list of addresses", addr1) + } + if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + t.Fatalf("should have %s in the list of addresses", addr2) + } +} + +// TestNDPRecursiveDNSServerDispatch tests that we properly dispatch an event +// to the integrator when an RA is received with the NDP Recursive DNS Server +// option with at least one valid address. +func TestNDPRecursiveDNSServerDispatch(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + opt header.NDPRecursiveDNSServer + expected *ndpRDNSS + }{ + { + "Unspecified", + header.NDPRecursiveDNSServer([]byte{ + 0, 0, + 0, 0, 0, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }), + nil, + }, + { + "Multicast", + header.NDPRecursiveDNSServer([]byte{ + 0, 0, + 0, 0, 0, 2, + 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + }), + nil, + }, + { + "OptionTooSmall", + header.NDPRecursiveDNSServer([]byte{ + 0, 0, + 0, 0, 0, 2, + 1, 2, 3, 4, 5, 6, 7, 8, + }), + nil, + }, + { + "0Addresses", + header.NDPRecursiveDNSServer([]byte{ + 0, 0, + 0, 0, 0, 2, + }), + nil, + }, + { + "Valid1Address", + header.NDPRecursiveDNSServer([]byte{ + 0, 0, + 0, 0, 0, 2, + 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 1, + }), + &ndpRDNSS{ + []tcpip.Address{ + "\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x01", + }, + 2 * time.Second, + }, + }, + { + "Valid2Addresses", + header.NDPRecursiveDNSServer([]byte{ + 0, 0, + 0, 0, 0, 1, + 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 1, + 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 2, + }), + &ndpRDNSS{ + []tcpip.Address{ + "\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x01", + "\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x02", + }, + time.Second, + }, + }, + { + "Valid3Addresses", + header.NDPRecursiveDNSServer([]byte{ + 0, 0, + 0, 0, 0, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 1, + 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 2, + 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 3, + }), + &ndpRDNSS{ + []tcpip.Address{ + "\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x01", + "\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x02", + "\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x03", + }, + 0, + }, + }, + } + + for _, test := range tests { + test := test + + t.Run(test.name, func(t *testing.T) { + t.Parallel() + + ndpDisp := ndpDispatcher{ + // We do not expect more than a single RDNSS + // event at any time for this test. + rdnssC: make(chan ndpRDNSSEvent, 1), + } + e := channel.New(0, 1280, linkAddr1) + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NDPConfigs: stack.NDPConfigurations{ + HandleRAs: true, + }, + NDPDisp: &ndpDisp, + }) + if err := s.CreateNIC(1, e); err != nil { + t.Fatalf("CreateNIC(1) = %s", err) + } + + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithOpts(llAddr1, 0, header.NDPOptionsSerializer{test.opt})) + + if test.expected != nil { + select { + case e := <-ndpDisp.rdnssC: + if e.nicID != 1 { + t.Errorf("got rdnss nicID = %d, want = 1", e.nicID) + } + if diff := cmp.Diff(e.rdnss.addrs, test.expected.addrs); diff != "" { + t.Errorf("rdnss addrs mismatch (-want +got):\n%s", diff) + } + if e.rdnss.lifetime != test.expected.lifetime { + t.Errorf("got rdnss lifetime = %s, want = %s", e.rdnss.lifetime, test.expected.lifetime) + } + default: + t.Fatal("expected an RDNSS option event") + } + } + + // Should have no more RDNSS options. + select { + case e := <-ndpDisp.rdnssC: + t.Fatalf("unexpectedly got a new RDNSS option event: %+v", e) + default: + } + }) + } +} + +// TestCleanupHostOnlyStateOnBecomingRouter tests that all discovered routers +// and prefixes, and auto-generated addresses get invalidated when a NIC +// becomes a router. +func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) { + t.Parallel() + + const ( + lifetimeSeconds = 5 + maxEvents = 4 + nicID1 = 1 + nicID2 = 2 + ) + + prefix1, subnet1, e1Addr1 := prefixSubnetAddr(0, linkAddr1) + prefix2, subnet2, e1Addr2 := prefixSubnetAddr(1, linkAddr1) + e2Addr1 := addrForSubnet(subnet1, linkAddr2) + e2Addr2 := addrForSubnet(subnet2, linkAddr2) + + ndpDisp := ndpDispatcher{ + routerC: make(chan ndpRouterEvent, maxEvents), + rememberRouter: true, + prefixC: make(chan ndpPrefixEvent, maxEvents), + rememberPrefix: true, + autoGenAddrC: make(chan ndpAutoGenAddrEvent, maxEvents), + } + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NDPConfigs: stack.NDPConfigurations{ + HandleRAs: true, + DiscoverDefaultRouters: true, + DiscoverOnLinkPrefixes: true, + AutoGenGlobalAddresses: true, + }, + NDPDisp: &ndpDisp, + }) + + e1 := channel.New(0, 1280, linkAddr1) + if err := s.CreateNIC(nicID1, e1); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID1, err) + } + + e2 := channel.New(0, 1280, linkAddr2) + if err := s.CreateNIC(nicID2, e2); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID2, err) + } + + expectRouterEvent := func() (bool, ndpRouterEvent) { + select { + case e := <-ndpDisp.routerC: + return true, e + default: + } + + return false, ndpRouterEvent{} + } + + expectPrefixEvent := func() (bool, ndpPrefixEvent) { + select { + case e := <-ndpDisp.prefixC: + return true, e + default: + } + + return false, ndpPrefixEvent{} + } + + expectAutoGenAddrEvent := func() (bool, ndpAutoGenAddrEvent) { + select { + case e := <-ndpDisp.autoGenAddrC: + return true, e + default: + } + + return false, ndpAutoGenAddrEvent{} + } + + // Receive RAs on NIC(1) and NIC(2) from default routers (llAddr1 and + // llAddr2) w/ PI (for prefix1 in RA from llAddr1 and prefix2 in RA from + // llAddr2) to discover multiple routers and prefixes, and auto-gen + // multiple addresses. + + e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr1, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds)) + // We have other tests that make sure we receive the *correct* events + // on normal discovery of routers/prefixes, and auto-generated + // addresses. Here we just make sure we get an event and let other tests + // handle the correctness check. + if ok, _ := expectRouterEvent(); !ok { + t.Errorf("expected router event for %s on NIC(%d)", llAddr1, nicID1) + } + if ok, _ := expectPrefixEvent(); !ok { + t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID1) + } + if ok, _ := expectAutoGenAddrEvent(); !ok { + t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr1, nicID1) + } + + e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds)) + if ok, _ := expectRouterEvent(); !ok { + t.Errorf("expected router event for %s on NIC(%d)", llAddr2, nicID1) + } + if ok, _ := expectPrefixEvent(); !ok { + t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID1) + } + if ok, _ := expectAutoGenAddrEvent(); !ok { + t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID1) + } + + e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr1, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds)) + if ok, _ := expectRouterEvent(); !ok { + t.Errorf("expected router event for %s on NIC(%d)", llAddr1, nicID2) + } + if ok, _ := expectPrefixEvent(); !ok { + t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID2) + } + if ok, _ := expectAutoGenAddrEvent(); !ok { + t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID2) + } + + e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds)) + if ok, _ := expectRouterEvent(); !ok { + t.Errorf("expected router event for %s on NIC(%d)", llAddr2, nicID2) + } + if ok, _ := expectPrefixEvent(); !ok { + t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID2) + } + if ok, _ := expectAutoGenAddrEvent(); !ok { + t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e2Addr2, nicID2) + } + + // We should have the auto-generated addresses added. + nicinfo := s.NICInfo() + nic1Addrs := nicinfo[nicID1].ProtocolAddresses + nic2Addrs := nicinfo[nicID2].ProtocolAddresses + if !contains(nic1Addrs, e1Addr1) { + t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs) + } + if !contains(nic1Addrs, e1Addr2) { + t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs) + } + if !contains(nic2Addrs, e2Addr1) { + t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs) + } + if !contains(nic2Addrs, e2Addr2) { + t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs) + } + + // We can't proceed any further if we already failed the test (missing + // some discovery/auto-generated address events or addresses). + if t.Failed() { + t.FailNow() + } + + s.SetForwarding(true) + + // Collect invalidation events after becoming a router + gotRouterEvents := make(map[ndpRouterEvent]int) + for i := 0; i < maxEvents; i++ { + ok, e := expectRouterEvent() + if !ok { + t.Errorf("expected %d router events after becoming a router; got = %d", maxEvents, i) + break + } + gotRouterEvents[e]++ + } + gotPrefixEvents := make(map[ndpPrefixEvent]int) + for i := 0; i < maxEvents; i++ { + ok, e := expectPrefixEvent() + if !ok { + t.Errorf("expected %d prefix events after becoming a router; got = %d", maxEvents, i) + break + } + gotPrefixEvents[e]++ + } + gotAutoGenAddrEvents := make(map[ndpAutoGenAddrEvent]int) + for i := 0; i < maxEvents; i++ { + ok, e := expectAutoGenAddrEvent() + if !ok { + t.Errorf("expected %d auto-generated address events after becoming a router; got = %d", maxEvents, i) + break + } + gotAutoGenAddrEvents[e]++ + } + + // No need to proceed any further if we already failed the test (missing + // some invalidation events). + if t.Failed() { + t.FailNow() + } + + expectedRouterEvents := map[ndpRouterEvent]int{ + {nicID: nicID1, addr: llAddr1, discovered: false}: 1, + {nicID: nicID1, addr: llAddr2, discovered: false}: 1, + {nicID: nicID2, addr: llAddr1, discovered: false}: 1, + {nicID: nicID2, addr: llAddr2, discovered: false}: 1, + } + if diff := cmp.Diff(expectedRouterEvents, gotRouterEvents); diff != "" { + t.Errorf("router events mismatch (-want +got):\n%s", diff) + } + expectedPrefixEvents := map[ndpPrefixEvent]int{ + {nicID: nicID1, prefix: subnet1, discovered: false}: 1, + {nicID: nicID1, prefix: subnet2, discovered: false}: 1, + {nicID: nicID2, prefix: subnet1, discovered: false}: 1, + {nicID: nicID2, prefix: subnet2, discovered: false}: 1, + } + if diff := cmp.Diff(expectedPrefixEvents, gotPrefixEvents); diff != "" { + t.Errorf("prefix events mismatch (-want +got):\n%s", diff) + } + expectedAutoGenAddrEvents := map[ndpAutoGenAddrEvent]int{ + {nicID: nicID1, addr: e1Addr1, eventType: invalidatedAddr}: 1, + {nicID: nicID1, addr: e1Addr2, eventType: invalidatedAddr}: 1, + {nicID: nicID2, addr: e2Addr1, eventType: invalidatedAddr}: 1, + {nicID: nicID2, addr: e2Addr2, eventType: invalidatedAddr}: 1, + } + if diff := cmp.Diff(expectedAutoGenAddrEvents, gotAutoGenAddrEvents); diff != "" { + t.Errorf("auto-generated address events mismatch (-want +got):\n%s", diff) + } + + // Make sure the auto-generated addresses got removed. + nicinfo = s.NICInfo() + nic1Addrs = nicinfo[nicID1].ProtocolAddresses + nic2Addrs = nicinfo[nicID2].ProtocolAddresses + if contains(nic1Addrs, e1Addr1) { + t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs) + } + if contains(nic1Addrs, e1Addr2) { + t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs) + } + if contains(nic2Addrs, e2Addr1) { + t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs) + } + if contains(nic2Addrs, e2Addr2) { + t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs) + } + + // Should not get any more events (invalidation timers should have been + // cancelled when we transitioned into a router). + time.Sleep(lifetimeSeconds*time.Second + defaultTimeout) + select { + case <-ndpDisp.routerC: + t.Error("unexpected router event") + default: + } + select { + case <-ndpDisp.prefixC: + t.Error("unexpected prefix event") + default: + } + select { + case <-ndpDisp.autoGenAddrC: + t.Error("unexpected auto-generated address event") + default: } } diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 3f8d7312c..4144d5d0f 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -27,11 +27,10 @@ import ( // NIC represents a "network interface card" to which the networking stack is // attached. type NIC struct { - stack *Stack - id tcpip.NICID - name string - linkEP LinkEndpoint - loopback bool + stack *Stack + id tcpip.NICID + name string + linkEP LinkEndpoint mu sync.RWMutex spoofing bool @@ -85,7 +84,7 @@ const ( ) // newNIC returns a new NIC using the default NDP configurations from stack. -func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback bool) *NIC { +func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint) *NIC { // TODO(b/141011931): Validate a LinkEndpoint (ep) is valid. For // example, make sure that the link address it provides is a valid // unicast ethernet address. @@ -99,7 +98,6 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback id: id, name: name, linkEP: ep, - loopback: loopback, primary: make(map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint), endpoints: make(map[NetworkEndpointID]*referencedNetworkEndpoint), mcastJoins: make(map[NetworkEndpointID]int32), @@ -115,10 +113,11 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback }, }, ndp: ndpState{ - configs: stack.ndpConfigs, - dad: make(map[tcpip.Address]dadState), - defaultRouters: make(map[tcpip.Address]defaultRouterState), - onLinkPrefixes: make(map[tcpip.Subnet]onLinkPrefixState), + configs: stack.ndpConfigs, + dad: make(map[tcpip.Address]dadState), + defaultRouters: make(map[tcpip.Address]defaultRouterState), + onLinkPrefixes: make(map[tcpip.Subnet]onLinkPrefixState), + autoGenAddresses: make(map[tcpip.Address]autoGenAddressState), }, } nic.ndp.nic = nic @@ -173,23 +172,28 @@ func (n *NIC) enable() *tcpip.Error { return err } - if !n.stack.autoGenIPv6LinkLocal { + // Do not auto-generate an IPv6 link-local address for loopback devices. + if !n.stack.autoGenIPv6LinkLocal || n.isLoopback() { return nil } - l2addr := n.linkEP.LinkAddress() + var addr tcpip.Address + if oIID := n.stack.opaqueIIDOpts; oIID.NICNameFromID != nil { + addr = header.LinkLocalAddrWithOpaqueIID(oIID.NICNameFromID(n.ID(), n.name), 0, oIID.SecretKey) + } else { + l2addr := n.linkEP.LinkAddress() - // Only attempt to generate the link-local address if we have a - // valid MAC address. - // - // TODO(b/141011931): Validate a LinkEndpoint's link address - // (provided by LinkEndpoint.LinkAddress) before reaching this - // point. - if !header.IsValidUnicastEthernetAddress(l2addr) { - return nil - } + // Only attempt to generate the link-local address if we have a valid MAC + // address. + // + // TODO(b/141011931): Validate a LinkEndpoint's link address (provided by + // LinkEndpoint.LinkAddress) before reaching this point. + if !header.IsValidUnicastEthernetAddress(l2addr) { + return nil + } - addr := header.LinkLocalAddr(l2addr) + addr = header.LinkLocalAddr(l2addr) + } _, err := n.addPermanentAddressLocked(tcpip.ProtocolAddress{ Protocol: header.IPv6ProtocolNumber, @@ -202,6 +206,18 @@ func (n *NIC) enable() *tcpip.Error { return err } +// becomeIPv6Router transitions n into an IPv6 router. +// +// When transitioning into an IPv6 router, host-only state (NDP discovered +// routers, discovered on-link prefixes, and auto-generated addresses) will +// be cleaned up/invalidated. +func (n *NIC) becomeIPv6Router() { + n.mu.Lock() + defer n.mu.Unlock() + + n.ndp.cleanupHostOnlyState() +} + // attachLinkEndpoint attaches the NIC to the endpoint, which will enable it // to start delivering packets. func (n *NIC) attachLinkEndpoint() { @@ -222,6 +238,10 @@ func (n *NIC) isPromiscuousMode() bool { return rv } +func (n *NIC) isLoopback() bool { + return n.linkEP.Capabilities()&CapabilityLoopback != 0 +} + // setSpoofing enables or disables address spoofing. func (n *NIC) setSpoofing(enable bool) { n.mu.Lock() @@ -231,17 +251,61 @@ func (n *NIC) setSpoofing(enable bool) { // primaryEndpoint returns the primary endpoint of n for the given network // protocol. +// +// primaryEndpoint will return the first non-deprecated endpoint if such an +// endpoint exists. If no non-deprecated endpoint exists, the first deprecated +// endpoint will be returned. func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber) *referencedNetworkEndpoint { n.mu.RLock() defer n.mu.RUnlock() + var deprecatedEndpoint *referencedNetworkEndpoint for _, r := range n.primary[protocol] { - if r.isValidForOutgoing() && r.tryIncRef() { - return r + if !r.isValidForOutgoing() { + continue + } + + if !r.deprecated { + if r.tryIncRef() { + // r is not deprecated, so return it immediately. + // + // If we kept track of a deprecated endpoint, decrement its reference + // count since it was incremented when we decided to keep track of it. + if deprecatedEndpoint != nil { + deprecatedEndpoint.decRefLocked() + deprecatedEndpoint = nil + } + + return r + } + } else if deprecatedEndpoint == nil && r.tryIncRef() { + // We prefer an endpoint that is not deprecated, but we keep track of r in + // case n doesn't have any non-deprecated endpoints. + // + // If we end up finding a more preferred endpoint, r's reference count + // will be decremented when such an endpoint is found. + deprecatedEndpoint = r } } - return nil + // n doesn't have any valid non-deprecated endpoints, so return + // deprecatedEndpoint (which may be nil if n doesn't have any valid deprecated + // endpoints either). + return deprecatedEndpoint +} + +// hasPermanentAddrLocked returns true if n has a permanent (including currently +// tentative) address, addr. +func (n *NIC) hasPermanentAddrLocked(addr tcpip.Address) bool { + ref, ok := n.endpoints[NetworkEndpointID{addr}] + + if !ok { + return false + } + + kind := ref.getKind() + + return kind == permanent || kind == permanentTentative } func (n *NIC) getRef(protocol tcpip.NetworkProtocolNumber, dst tcpip.Address) *referencedNetworkEndpoint { @@ -335,7 +399,7 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t Address: address, PrefixLen: netProto.DefaultPrefixLen(), }, - }, peb, temporary) + }, peb, temporary, static, false) n.mu.Unlock() return ref @@ -384,10 +448,10 @@ func (n *NIC) addPermanentAddressLocked(protocolAddress tcpip.ProtocolAddress, p } } - return n.addAddressLocked(protocolAddress, peb, permanent) + return n.addAddressLocked(protocolAddress, peb, permanent, static, false) } -func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior, kind networkEndpointKind) (*referencedNetworkEndpoint, *tcpip.Error) { +func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior, kind networkEndpointKind, configType networkEndpointConfigType, deprecated bool) (*referencedNetworkEndpoint, *tcpip.Error) { // TODO(b/141022673): Validate IP address before adding them. // Sanity check. @@ -417,11 +481,13 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar } ref := &referencedNetworkEndpoint{ - refs: 1, - ep: ep, - nic: n, - protocol: protocolAddress.Protocol, - kind: kind, + refs: 1, + ep: ep, + nic: n, + protocol: protocolAddress.Protocol, + kind: kind, + configType: configType, + deprecated: deprecated, } // Set up cache if link address resolution exists for this protocol. @@ -520,6 +586,51 @@ func (n *NIC) PrimaryAddresses() []tcpip.ProtocolAddress { return addrs } +// primaryAddress returns the primary address associated with this NIC. +// +// primaryAddress will return the first non-deprecated address if such an +// address exists. If no non-deprecated address exists, the first deprecated +// address will be returned. +func (n *NIC) primaryAddress(proto tcpip.NetworkProtocolNumber) tcpip.AddressWithPrefix { + n.mu.RLock() + defer n.mu.RUnlock() + + list, ok := n.primary[proto] + if !ok { + return tcpip.AddressWithPrefix{} + } + + var deprecatedEndpoint *referencedNetworkEndpoint + for _, ref := range list { + // Don't include tentative, expired or tempory endpoints to avoid confusion + // and prevent the caller from using those. + switch ref.getKind() { + case permanentTentative, permanentExpired, temporary: + continue + } + + if !ref.deprecated { + return tcpip.AddressWithPrefix{ + Address: ref.ep.ID().LocalAddress, + PrefixLen: ref.ep.PrefixLen(), + } + } + + if deprecatedEndpoint == nil { + deprecatedEndpoint = ref + } + } + + if deprecatedEndpoint != nil { + return tcpip.AddressWithPrefix{ + Address: deprecatedEndpoint.ep.ID().LocalAddress, + PrefixLen: deprecatedEndpoint.ep.PrefixLen(), + } + } + + return tcpip.AddressWithPrefix{} +} + // AddAddressRange adds a range of addresses to n, so that it starts accepting // packets targeted at the given addresses and network protocol. The range is // given by a subnet address, and all addresses contained in the subnet are @@ -624,9 +735,18 @@ func (n *NIC) removePermanentAddressLocked(addr tcpip.Address) *tcpip.Error { isIPv6Unicast := r.protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(addr) - // If we are removing a tentative IPv6 unicast address, stop DAD. - if isIPv6Unicast && kind == permanentTentative { - n.ndp.stopDuplicateAddressDetection(addr) + if isIPv6Unicast { + // If we are removing a tentative IPv6 unicast address, stop + // DAD. + if kind == permanentTentative { + n.ndp.stopDuplicateAddressDetection(addr) + } + + // If we are removing an address generated via SLAAC, cleanup + // its SLAAC resources and notify the integrator. + if r.configType == slaac { + n.ndp.cleanupAutoGenAddrResourcesAndNotify(addr) + } } r.setKind(permanentExpired) @@ -989,7 +1109,7 @@ const ( // removing the permanent address from the NIC. permanent - // An expired permanent endoint is a permanent endoint that had its address + // An expired permanent endpoint is a permanent endpoint that had its address // removed from the NIC, and it is waiting to be removed once no more routes // hold a reference to it. This is achieved by decreasing its reference count // by 1. If its address is re-added before the endpoint is removed, its type @@ -1035,6 +1155,19 @@ func (n *NIC) unregisterPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep } } +type networkEndpointConfigType int32 + +const ( + // A statically configured endpoint is an address that was added by + // some user-specified action (adding an explicit address, joining a + // multicast group). + static networkEndpointConfigType = iota + + // A slaac configured endpoint is an IPv6 endpoint that was + // added by SLAAC as per RFC 4862 section 5.5.3. + slaac +) + type referencedNetworkEndpoint struct { ep NetworkEndpoint nic *NIC @@ -1050,6 +1183,15 @@ type referencedNetworkEndpoint struct { // networkEndpointKind must only be accessed using {get,set}Kind(). kind networkEndpointKind + + // configType is the method that was used to configure this endpoint. + // This must never change after the endpoint is added to a NIC. + configType networkEndpointConfigType + + // deprecated indicates whether or not the endpoint should be considered + // deprecated. That is, when deprecated is true, other endpoints that are not + // deprecated should be preferred. + deprecated bool } func (r *referencedNetworkEndpoint) getKind() networkEndpointKind { diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index 61fd46d66..2b8751d49 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -234,15 +234,15 @@ type NetworkEndpoint interface { // WritePacket writes a packet to the given destination address and // protocol. It sets pkt.NetworkHeader. pkt.TransportHeader must have // already been set. - WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, loop PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error + WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error // WritePackets writes packets to the given destination address and // protocol. pkts must not be zero length. - WritePackets(r *Route, gso *GSO, pkts []tcpip.PacketBuffer, params NetworkHeaderParams, loop PacketLooping) (int, *tcpip.Error) + WritePackets(r *Route, gso *GSO, pkts []tcpip.PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error) // WriteHeaderIncludedPacket writes a packet that includes a network // header to the given destination address. - WriteHeaderIncludedPacket(r *Route, loop PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error + WriteHeaderIncludedPacket(r *Route, pkt tcpip.PacketBuffer) *tcpip.Error // ID returns the network protocol endpoint ID. ID() *NetworkEndpointID diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go index 34307ae07..517f4b941 100644 --- a/pkg/tcpip/stack/route.go +++ b/pkg/tcpip/stack/route.go @@ -158,7 +158,7 @@ func (r *Route) WritePacket(gso *GSO, params NetworkHeaderParams, pkt tcpip.Pack return tcpip.ErrInvalidEndpointState } - err := r.ref.ep.WritePacket(r, gso, params, r.Loop, pkt) + err := r.ref.ep.WritePacket(r, gso, params, pkt) if err != nil { r.Stats().IP.OutgoingPacketErrors.Increment() } else { @@ -174,7 +174,7 @@ func (r *Route) WritePackets(gso *GSO, pkts []tcpip.PacketBuffer, params Network return 0, tcpip.ErrInvalidEndpointState } - n, err := r.ref.ep.WritePackets(r, gso, pkts, params, r.Loop) + n, err := r.ref.ep.WritePackets(r, gso, pkts, params) if err != nil { r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(len(pkts) - n)) } @@ -195,7 +195,7 @@ func (r *Route) WriteHeaderIncludedPacket(pkt tcpip.PacketBuffer) *tcpip.Error { return tcpip.ErrInvalidEndpointState } - if err := r.ref.ep.WriteHeaderIncludedPacket(r, r.Loop, pkt); err != nil { + if err := r.ref.ep.WriteHeaderIncludedPacket(r, pkt); err != nil { r.Stats().IP.OutgoingPacketErrors.Increment() return err } diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 0e88643a4..807f910f6 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -352,6 +352,38 @@ func (u *uniqueIDGenerator) UniqueID() uint64 { return atomic.AddUint64((*uint64)(u), 1) } +// NICNameFromID is a function that returns a stable name for the specified NIC, +// even if different NIC IDs are used to refer to the same NIC in different +// program runs. It is used when generating opaque interface identifiers (IIDs). +// If the NIC was created with a name, it will be passed to NICNameFromID. +// +// NICNameFromID SHOULD return unique NIC names so unique opaque IIDs are +// generated for the same prefix on differnt NICs. +type NICNameFromID func(tcpip.NICID, string) string + +// OpaqueInterfaceIdentifierOptions holds the options related to the generation +// of opaque interface indentifiers (IIDs) as defined by RFC 7217. +type OpaqueInterfaceIdentifierOptions struct { + // NICNameFromID is a function that returns a stable name for a specified NIC, + // even if the NIC ID changes over time. + // + // Must be specified to generate the opaque IID. + NICNameFromID NICNameFromID + + // SecretKey is a pseudo-random number used as the secret key when generating + // opaque IIDs as defined by RFC 7217. The key SHOULD be at least + // header.OpaqueIIDSecretKeyMinBytes bytes and MUST follow minimum randomness + // requirements for security as outlined by RFC 4086. SecretKey MUST NOT + // change between program runs, unless explicitly changed. + // + // OpaqueInterfaceIdentifierOptions takes ownership of SecretKey. SecretKey + // MUST NOT be modified after Stack is created. + // + // May be nil, but a nil value is highly discouraged to maintain + // some level of randomness between nodes. + SecretKey []byte +} + // Stack is a networking stack, with all supported protocols, NICs, and route // table. type Stack struct { @@ -412,8 +444,8 @@ type Stack struct { ndpConfigs NDPConfigurations // autoGenIPv6LinkLocal determines whether or not the stack will attempt - // to auto-generate an IPv6 link-local address for newly enabled NICs. - // See the AutoGenIPv6LinkLocal field of Options for more details. + // to auto-generate an IPv6 link-local address for newly enabled non-loopback + // NICs. See the AutoGenIPv6LinkLocal field of Options for more details. autoGenIPv6LinkLocal bool // ndpDisp is the NDP event dispatcher that is used to send the netstack @@ -422,6 +454,10 @@ type Stack struct { // uniqueIDGenerator is a generator of unique identifiers. uniqueIDGenerator UniqueID + + // opaqueIIDOpts hold the options for generating opaque interface identifiers + // (IIDs) as outlined by RFC 7217. + opaqueIIDOpts OpaqueInterfaceIdentifierOptions } // UniqueID is an abstract generator of unique identifiers. @@ -460,13 +496,15 @@ type Options struct { // before assigning an address to a NIC. NDPConfigs NDPConfigurations - // AutoGenIPv6LinkLocal determins whether or not the stack will attempt - // to auto-generate an IPv6 link-local address for newly enabled NICs. + // AutoGenIPv6LinkLocal determines whether or not the stack will attempt to + // auto-generate an IPv6 link-local address for newly enabled non-loopback + // NICs. + // // Note, setting this to true does not mean that a link-local address - // will be assigned right away, or at all. If Duplicate Address - // Detection is enabled, an address will only be assigned if it - // successfully resolves. If it fails, no further attempt will be made - // to auto-generate an IPv6 link-local address. + // will be assigned right away, or at all. If Duplicate Address Detection + // is enabled, an address will only be assigned if it successfully resolves. + // If it fails, no further attempt will be made to auto-generate an IPv6 + // link-local address. // // The generated link-local address will follow RFC 4291 Appendix A // guidelines. @@ -479,6 +517,10 @@ type Options struct { // RawFactory produces raw endpoints. Raw endpoints are enabled only if // this is non-nil. RawFactory RawFactory + + // OpaqueIIDOpts hold the options for generating opaque interface identifiers + // (IIDs) as outlined by RFC 7217. + OpaqueIIDOpts OpaqueInterfaceIdentifierOptions } // TransportEndpointInfo holds useful information about a transport endpoint @@ -549,6 +591,7 @@ func New(opts Options) *Stack { autoGenIPv6LinkLocal: opts.AutoGenIPv6LinkLocal, uniqueIDGenerator: opts.UniqueID, ndpDisp: opts.NDPDisp, + opaqueIIDOpts: opts.OpaqueIIDOpts, } // Add specified network protocols. @@ -662,11 +705,31 @@ func (s *Stack) Stats() tcpip.Stats { } // SetForwarding enables or disables the packet forwarding between NICs. +// +// When forwarding becomes enabled, any host-only state on all NICs will be +// cleaned up. func (s *Stack) SetForwarding(enable bool) { // TODO(igudger, bgeffon): Expose via /proc/sys/net/ipv4/ip_forward. s.mu.Lock() + defer s.mu.Unlock() + + // If forwarding status didn't change, do nothing further. + if s.forwarding == enable { + return + } + s.forwarding = enable - s.mu.Unlock() + + // If this stack does not support IPv6, do nothing further. + if _, ok := s.networkProtocols[header.IPv6ProtocolNumber]; !ok { + return + } + + if enable { + for _, nic := range s.nics { + nic.becomeIPv6Router() + } + } } // Forwarding returns if the packet forwarding between NICs is enabled. @@ -735,7 +798,7 @@ func (s *Stack) NewPacketEndpoint(cooked bool, netProto tcpip.NetworkProtocolNum // createNIC creates a NIC with the provided id and link-layer endpoint, and // optionally enable it. -func (s *Stack) createNIC(id tcpip.NICID, name string, ep LinkEndpoint, enabled, loopback bool) *tcpip.Error { +func (s *Stack) createNIC(id tcpip.NICID, name string, ep LinkEndpoint, enabled bool) *tcpip.Error { s.mu.Lock() defer s.mu.Unlock() @@ -744,7 +807,7 @@ func (s *Stack) createNIC(id tcpip.NICID, name string, ep LinkEndpoint, enabled, return tcpip.ErrDuplicateNICID } - n := newNIC(s, id, name, ep, loopback) + n := newNIC(s, id, name, ep) s.nics[id] = n if enabled { @@ -756,32 +819,26 @@ func (s *Stack) createNIC(id tcpip.NICID, name string, ep LinkEndpoint, enabled, // CreateNIC creates a NIC with the provided id and link-layer endpoint. func (s *Stack) CreateNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error { - return s.createNIC(id, "", ep, true, false) + return s.createNIC(id, "", ep, true) } // CreateNamedNIC creates a NIC with the provided id and link-layer endpoint, // and a human-readable name. func (s *Stack) CreateNamedNIC(id tcpip.NICID, name string, ep LinkEndpoint) *tcpip.Error { - return s.createNIC(id, name, ep, true, false) -} - -// CreateNamedLoopbackNIC creates a NIC with the provided id and link-layer -// endpoint, and a human-readable name. -func (s *Stack) CreateNamedLoopbackNIC(id tcpip.NICID, name string, ep LinkEndpoint) *tcpip.Error { - return s.createNIC(id, name, ep, true, true) + return s.createNIC(id, name, ep, true) } // CreateDisabledNIC creates a NIC with the provided id and link-layer endpoint, // but leave it disable. Stack.EnableNIC must be called before the link-layer // endpoint starts delivering packets to it. func (s *Stack) CreateDisabledNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error { - return s.createNIC(id, "", ep, false, false) + return s.createNIC(id, "", ep, false) } // CreateDisabledNamedNIC is a combination of CreateNamedNIC and // CreateDisabledNIC. func (s *Stack) CreateDisabledNamedNIC(id tcpip.NICID, name string, ep LinkEndpoint) *tcpip.Error { - return s.createNIC(id, name, ep, false, false) + return s.createNIC(id, name, ep, false) } // EnableNIC enables the given NIC so that the link-layer endpoint can start @@ -848,7 +905,7 @@ func (s *Stack) NICInfo() map[tcpip.NICID]NICInfo { Up: true, // Netstack interfaces are always up. Running: nic.linkEP.IsAttached(), Promiscuous: nic.isPromiscuousMode(), - Loopback: nic.linkEP.Capabilities()&CapabilityLoopback != 0, + Loopback: nic.isLoopback(), } nics[id] = NICInfo{ Name: nic.name, @@ -973,9 +1030,11 @@ func (s *Stack) AllAddresses() map[tcpip.NICID][]tcpip.ProtocolAddress { return nics } -// GetMainNICAddress returns the first primary address and prefix for the given -// NIC and protocol. Returns an error if the NIC doesn't exist and an empty -// value if the NIC doesn't have a primary address for the given protocol. +// GetMainNICAddress returns the first non-deprecated primary address and prefix +// for the given NIC and protocol. If no non-deprecated primary address exists, +// a deprecated primary address and prefix will be returned. Returns an error if +// the NIC doesn't exist and an empty value if the NIC doesn't have a primary +// address for the given protocol. func (s *Stack) GetMainNICAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber) (tcpip.AddressWithPrefix, *tcpip.Error) { s.mu.RLock() defer s.mu.RUnlock() @@ -985,12 +1044,7 @@ func (s *Stack) GetMainNICAddress(id tcpip.NICID, protocol tcpip.NetworkProtocol return tcpip.AddressWithPrefix{}, tcpip.ErrUnknownNICID } - for _, a := range nic.PrimaryAddresses() { - if a.Protocol == protocol { - return a.AddressWithPrefix, nil - } - } - return tcpip.AddressWithPrefix{}, nil + return nic.primaryAddress(protocol), nil } func (s *Stack) getRefEP(nic *NIC, localAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) (ref *referencedNetworkEndpoint) { @@ -1012,7 +1066,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n if id != 0 && !needRoute { if nic, ok := s.nics[id]; ok { if ref := s.getRefEP(nic, localAddr, netProto); ref != nil { - return makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.loopback, multicastLoop && !nic.loopback), nil + return makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.isLoopback(), multicastLoop && !nic.isLoopback()), nil } } } else { @@ -1028,7 +1082,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n remoteAddr = ref.ep.ID().LocalAddress } - r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.loopback, multicastLoop && !nic.loopback) + r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.isLoopback(), multicastLoop && !nic.isLoopback()) if needRoute { r.NextHop = route.Gateway } diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index 8fc034ca1..33f20579f 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -27,10 +27,12 @@ import ( "time" "github.com/google/go-cmp/cmp" + "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/link/channel" + "gvisor.dev/gvisor/pkg/tcpip/link/loopback" "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -122,7 +124,7 @@ func (f *fakeNetworkEndpoint) Capabilities() stack.LinkEndpointCapabilities { return f.ep.Capabilities() } -func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error { +func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error { // Increment the sent packet count in the protocol descriptor. f.proto.sendPacketCount[int(r.RemoteAddress[0])%len(f.proto.sendPacketCount)]++ @@ -133,7 +135,7 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params b[1] = f.id.LocalAddress[0] b[2] = byte(params.Protocol) - if loop&stack.PacketLoop != 0 { + if r.Loop&stack.PacketLoop != 0 { views := make([]buffer.View, 1, 1+len(pkt.Data.Views())) views[0] = pkt.Header.View() views = append(views, pkt.Data.Views()...) @@ -141,7 +143,7 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views), }) } - if loop&stack.PacketOut == 0 { + if r.Loop&stack.PacketOut == 0 { return nil } @@ -149,11 +151,11 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params } // WritePackets implements stack.LinkEndpoint.WritePackets. -func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) { +func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) { panic("not implemented") } -func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error { +func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error { return tcpip.ErrNotSupported } @@ -1894,55 +1896,67 @@ func TestNICForwarding(t *testing.T) { } // TestNICAutoGenAddr tests the auto-generation of IPv6 link-local addresses -// (or lack there-of if disabled (default)). Note, DAD will be disabled in -// these tests. +// using the modified EUI-64 of the NIC's MAC address (or lack there-of if +// disabled (default)). Note, DAD will be disabled in these tests. func TestNICAutoGenAddr(t *testing.T) { tests := []struct { name string autoGen bool linkAddr tcpip.LinkAddress + iidOpts stack.OpaqueInterfaceIdentifierOptions shouldGen bool }{ { "Disabled", false, linkAddr1, + stack.OpaqueInterfaceIdentifierOptions{ + NICNameFromID: func(nicID tcpip.NICID, _ string) string { + return fmt.Sprintf("nic%d", nicID) + }, + }, false, }, { "Enabled", true, linkAddr1, + stack.OpaqueInterfaceIdentifierOptions{}, true, }, { "Nil MAC", true, tcpip.LinkAddress([]byte(nil)), + stack.OpaqueInterfaceIdentifierOptions{}, false, }, { "Empty MAC", true, tcpip.LinkAddress(""), + stack.OpaqueInterfaceIdentifierOptions{}, false, }, { "Invalid MAC", true, tcpip.LinkAddress("\x01\x02\x03"), + stack.OpaqueInterfaceIdentifierOptions{}, false, }, { "Multicast MAC", true, tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"), + stack.OpaqueInterfaceIdentifierOptions{}, false, }, { "Unspecified MAC", true, tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00"), + stack.OpaqueInterfaceIdentifierOptions{}, false, }, } @@ -1951,13 +1965,12 @@ func TestNICAutoGenAddr(t *testing.T) { t.Run(test.name, func(t *testing.T) { opts := stack.Options{ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + OpaqueIIDOpts: test.iidOpts, } if test.autoGen { - // Only set opts.AutoGenIPv6LinkLocal when - // test.autoGen is true because - // opts.AutoGenIPv6LinkLocal should be false by - // default. + // Only set opts.AutoGenIPv6LinkLocal when test.autoGen is true because + // opts.AutoGenIPv6LinkLocal should be false by default. opts.AutoGenIPv6LinkLocal = true } @@ -1973,9 +1986,130 @@ func TestNICAutoGenAddr(t *testing.T) { } if test.shouldGen { + // Should have auto-generated an address and resolved immediately (DAD + // is disabled). + if want := (tcpip.AddressWithPrefix{Address: header.LinkLocalAddr(test.linkAddr), PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want { + t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want) + } + } else { + // Should not have auto-generated an address. + if want := (tcpip.AddressWithPrefix{}); addr != want { + t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want) + } + } + }) + } +} + +// TestNICAutoGenAddrWithOpaque tests the auto-generation of IPv6 link-local +// addresses with opaque interface identifiers. Link Local addresses should +// always be generated with opaque IIDs if configured to use them, even if the +// NIC has an invalid MAC address. +func TestNICAutoGenAddrWithOpaque(t *testing.T) { + const nicID = 1 + + var secretKey [header.OpaqueIIDSecretKeyMinBytes]byte + n, err := rand.Read(secretKey[:]) + if err != nil { + t.Fatalf("rand.Read(_): %s", err) + } + if n != header.OpaqueIIDSecretKeyMinBytes { + t.Fatalf("expected rand.Read to read %d bytes, read %d bytes", header.OpaqueIIDSecretKeyMinBytes, n) + } + + tests := []struct { + name string + nicName string + autoGen bool + linkAddr tcpip.LinkAddress + secretKey []byte + }{ + { + name: "Disabled", + nicName: "nic1", + autoGen: false, + linkAddr: linkAddr1, + secretKey: secretKey[:], + }, + { + name: "Enabled", + nicName: "nic1", + autoGen: true, + linkAddr: linkAddr1, + secretKey: secretKey[:], + }, + // These are all cases where we would not have generated a + // link-local address if opaque IIDs were disabled. + { + name: "Nil MAC and empty nicName", + nicName: "", + autoGen: true, + linkAddr: tcpip.LinkAddress([]byte(nil)), + secretKey: secretKey[:1], + }, + { + name: "Empty MAC and empty nicName", + autoGen: true, + linkAddr: tcpip.LinkAddress(""), + secretKey: secretKey[:2], + }, + { + name: "Invalid MAC", + nicName: "test", + autoGen: true, + linkAddr: tcpip.LinkAddress("\x01\x02\x03"), + secretKey: secretKey[:3], + }, + { + name: "Multicast MAC", + nicName: "test2", + autoGen: true, + linkAddr: tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"), + secretKey: secretKey[:4], + }, + { + name: "Unspecified MAC and nil SecretKey", + nicName: "test3", + autoGen: true, + linkAddr: tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00"), + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + opts := stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{ + NICNameFromID: func(_ tcpip.NICID, nicName string) string { + return nicName + }, + SecretKey: test.secretKey, + }, + } + + if test.autoGen { + // Only set opts.AutoGenIPv6LinkLocal when + // test.autoGen is true because + // opts.AutoGenIPv6LinkLocal should be false by + // default. + opts.AutoGenIPv6LinkLocal = true + } + + e := channel.New(10, 1280, test.linkAddr) + s := stack.New(opts) + if err := s.CreateNamedNIC(nicID, test.nicName, e); err != nil { + t.Fatalf("CreateNamedNIC(%d, %q, _) = %s", nicID, test.nicName, err) + } + + addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) + if err != nil { + t.Fatalf("stack.GetMainNICAddress(%d, _) err = %s", nicID, err) + } + + if test.autoGen { // Should have auto-generated an address and // resolved immediately (DAD is disabled). - if want := (tcpip.AddressWithPrefix{Address: header.LinkLocalAddr(test.linkAddr), PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want { + if want := (tcpip.AddressWithPrefix{Address: header.LinkLocalAddrWithOpaqueIID(test.nicName, 0, test.secretKey), PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want { t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want) } } else { @@ -1988,6 +2122,55 @@ func TestNICAutoGenAddr(t *testing.T) { } } +// TestNoLinkLocalAutoGenForLoopbackNIC tests that IPv6 link-local addresses are +// not auto-generated for loopback NICs. +func TestNoLinkLocalAutoGenForLoopbackNIC(t *testing.T) { + const nicID = 1 + const nicName = "nicName" + + tests := []struct { + name string + opaqueIIDOpts stack.OpaqueInterfaceIdentifierOptions + }{ + { + name: "IID From MAC", + opaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{}, + }, + { + name: "Opaque IID", + opaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{ + NICNameFromID: func(_ tcpip.NICID, nicName string) string { + return nicName + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + opts := stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + AutoGenIPv6LinkLocal: true, + OpaqueIIDOpts: test.opaqueIIDOpts, + } + + e := loopback.New() + s := stack.New(opts) + if err := s.CreateNamedNIC(nicID, nicName, e); err != nil { + t.Fatalf("CreateNamedNIC(%d, %q, _) = %s", nicID, nicName, err) + } + + addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) + if err != nil { + t.Fatalf("stack.GetMainNICAddress(%d, _) err = %s", nicID, err) + } + if want := (tcpip.AddressWithPrefix{}); addr != want { + t.Errorf("got stack.GetMainNICAddress(%d, _) = %s, want = %s", nicID, addr, want) + } + }) + } +} + // TestNICAutoGenAddrDoesDAD tests that the successful auto-generation of IPv6 // link-local addresses will only be assigned after the DAD process resolves. func TestNICAutoGenAddrDoesDAD(t *testing.T) { diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index bd5eb89ca..f62fd729f 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -308,7 +308,7 @@ type ControlMessages struct { // HasTimestamp indicates whether Timestamp is valid/set. HasTimestamp bool - // Timestamp is the time (in ns) that the last packed used to create + // Timestamp is the time (in ns) that the last packet used to create // the read data was received. Timestamp int64 @@ -317,6 +317,18 @@ type ControlMessages struct { // Inq is the number of bytes ready to be received. Inq int32 + + // HasTOS indicates whether Tos is valid/set. + HasTOS bool + + // TOS is the IPv4 type of service of the associated packet. + TOS int8 + + // HasTClass indicates whether Tclass is valid/set. + HasTClass bool + + // Tclass is the IPv6 traffic class of the associated packet. + TClass int32 } // Endpoint is the interface implemented by transport protocols (e.g., tcp, udp) @@ -564,6 +576,11 @@ type KeepaliveIntervalOption time.Duration // closed. type KeepaliveCountOption int +// TCPUserTimeoutOption is used by SetSockOpt/GetSockOpt to specify a user +// specified timeout for a given TCP connection. +// See: RFC5482 for details. +type TCPUserTimeoutOption time.Duration + // CongestionControlOption is used by SetSockOpt/GetSockOpt to set/get // the current congestion control algorithm. type CongestionControlOption string @@ -912,6 +929,14 @@ type TCPStats struct { // ESTABLISHED state or the CLOSE-WAIT state. EstablishedResets *StatCounter + // EstablishedClosed is the number of times established TCP connections + // made a transition to CLOSED state. + EstablishedClosed *StatCounter + + // EstablishedTimedout is the number of times an established connection + // was reset because of keep-alive time out. + EstablishedTimedout *StatCounter + // ListenOverflowSynDrop is the number of times the listen queue overflowed // and a SYN was dropped. ListenOverflowSynDrop *StatCounter diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index dd1728f9c..3b353d56c 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -28,6 +28,7 @@ go_library( "forwarder.go", "protocol.go", "rcv.go", + "rcv_state.go", "reno.go", "sack.go", "sack_scoreboard.go", @@ -52,6 +53,7 @@ go_library( "//pkg/tcpip/hash/jenkins", "//pkg/tcpip/header", "//pkg/tcpip/iptables", + "//pkg/tcpip/ports", "//pkg/tcpip/seqnum", "//pkg/tcpip/stack", "//pkg/tcpip/transport/raw", diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index f543a6105..5422ae80c 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -242,6 +242,13 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i n.initGSO() + // Now inherit any socket options that should be inherited from the + // listening endpoint. + // In case of Forwarder listenEP will be nil and hence this check. + if l.listenEP != nil { + l.listenEP.propagateInheritableOptions(n) + } + // Register new endpoint so that packets are routed to it. if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.ID, n, n.reusePort, n.boundBindToDevice); err != nil { n.Close() @@ -298,8 +305,6 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head return nil, err } ep.mu.Lock() - ep.stack.Stats().TCP.CurrentEstablished.Increment() - ep.state = StateEstablished ep.isConnectNotified = true ep.mu.Unlock() @@ -352,6 +357,14 @@ func (e *endpoint) deliverAccepted(n *endpoint) { } } +// propagateInheritableOptions propagates any options set on the listening +// endpoint to the newly created endpoint. +func (e *endpoint) propagateInheritableOptions(n *endpoint) { + e.mu.Lock() + n.userTimeout = e.userTimeout + e.mu.Unlock() +} + // handleSynSegment is called in its own goroutine once the listening endpoint // receives a SYN segment. It is responsible for completing the handshake and // queueing the new endpoint for acceptance. @@ -546,6 +559,8 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) { n.tsOffset = 0 // Switch state to connected. + // We do not use transitionToStateEstablishedLocked here as there is + // no handshake state available when doing a SYN cookie based accept. n.stack.Stats().TCP.CurrentEstablished.Increment() n.state = StateEstablished n.isConnectNotified = true diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 4206db8b6..cdd69f360 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -218,6 +218,14 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error { // acceptable if the ack field acknowledges the SYN. if s.flagIsSet(header.TCPFlagRst) { if s.flagIsSet(header.TCPFlagAck) && s.ackNumber == h.iss+1 { + // RFC 793, page 67, states that "If the RST bit is set [and] If the ACK + // was acceptable then signal the user "error: connection reset", drop + // the segment, enter CLOSED state, delete TCB, and return." + h.ep.mu.Lock() + h.ep.workerCleanup = true + h.ep.mu.Unlock() + // Although the RFC above calls out ECONNRESET, Linux actually returns + // ECONNREFUSED here so we do as well. return tcpip.ErrConnectionRefused } return nil @@ -252,6 +260,11 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error { // and the handshake is completed. if s.flagIsSet(header.TCPFlagAck) { h.state = handshakeCompleted + + h.ep.mu.Lock() + h.ep.transitionToStateEstablishedLocked(h) + h.ep.mu.Unlock() + h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale()) return nil } @@ -352,6 +365,10 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error { h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber) } h.state = handshakeCompleted + h.ep.mu.Lock() + h.ep.transitionToStateEstablishedLocked(h) + h.ep.mu.Unlock() + return nil } @@ -853,7 +870,7 @@ func (e *endpoint) resetConnectionLocked(err *tcpip.Error) { } e.state = StateError e.HardError = err - if err != tcpip.ErrConnectionReset { + if err != tcpip.ErrConnectionReset && err != tcpip.ErrTimeout { // The exact sequence number to be used for the RST is the same as the // one used by Linux. We need to handle the case of window being shrunk // which can cause sndNxt to be outside the acceptable window on the @@ -880,6 +897,30 @@ func (e *endpoint) completeWorkerLocked() { } } +// transitionToStateEstablisedLocked transitions a given endpoint +// to an established state using the handshake parameters provided. +// It also initializes sender/receiver if required. +func (e *endpoint) transitionToStateEstablishedLocked(h *handshake) { + if e.snd == nil { + // Transfer handshake state to TCP connection. We disable + // receive window scaling if the peer doesn't support it + // (indicated by a negative send window scale). + e.snd = newSender(e, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale) + } + if e.rcv == nil { + rcvBufSize := seqnum.Size(e.receiveBufferSize()) + e.rcvListMu.Lock() + e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale(), rcvBufSize) + // Bootstrap the auto tuning algorithm. Starting at zero will + // result in a really large receive window after the first auto + // tuning adjustment. + e.rcvAutoParams.prevCopied = int(h.rcvWnd) + e.rcvListMu.Unlock() + } + h.ep.stack.Stats().TCP.CurrentEstablished.Increment() + e.state = StateEstablished +} + // transitionToStateCloseLocked ensures that the endpoint is // cleaned up from the transport demuxer, "before" moving to // StateClose. This will ensure that no packet will be @@ -891,6 +932,7 @@ func (e *endpoint) transitionToStateCloseLocked() { } e.cleanupLocked() e.state = StateClose + e.stack.Stats().TCP.EstablishedClosed.Increment() } // tryDeliverSegmentFromClosedEndpoint attempts to deliver the parsed @@ -953,20 +995,6 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) { func (e *endpoint) handleSegments() *tcpip.Error { checkRequeue := true for i := 0; i < maxSegmentsPerWake; i++ { - e.mu.RLock() - state := e.state - e.mu.RUnlock() - if state == StateClose { - // When we get into StateClose while processing from the queue, - // return immediately and let the protocolMainloop handle it. - // - // We can reach StateClose only while processing a previous segment - // or a notification from the protocolMainLoop (caller goroutine). - // This means that with this return, the segment dequeue below can - // never occur on a closed endpoint. - return nil - } - s := e.segmentQueue.dequeue() if s == nil { checkRequeue = false @@ -1024,6 +1052,24 @@ func (e *endpoint) handleSegments() *tcpip.Error { s.decRef() continue } + + // Now check if the received segment has caused us to transition + // to a CLOSED state, if yes then terminate processing and do + // not invoke the sender. + e.mu.RLock() + state := e.state + e.mu.RUnlock() + if state == StateClose { + // When we get into StateClose while processing from the queue, + // return immediately and let the protocolMainloop handle it. + // + // We can reach StateClose only while processing a previous segment + // or a notification from the protocolMainLoop (caller goroutine). + // This means that with this return, the segment dequeue below can + // never occur on a closed endpoint. + s.decRef() + return nil + } e.snd.handleRcvdSegment(s) } s.decRef() @@ -1049,14 +1095,27 @@ func (e *endpoint) handleSegments() *tcpip.Error { // keepalive packets periodically when the connection is idle. If we don't hear // from the other side after a number of tries, we terminate the connection. func (e *endpoint) keepaliveTimerExpired() *tcpip.Error { + e.mu.RLock() + userTimeout := e.userTimeout + e.mu.RUnlock() + e.keepalive.Lock() if !e.keepalive.enabled || !e.keepalive.timer.checkExpiration() { e.keepalive.Unlock() return nil } + // If a userTimeout is set then abort the connection if it is + // exceeded. + if userTimeout != 0 && time.Since(e.rcv.lastRcvdAckTime) >= userTimeout && e.keepalive.unacked > 0 { + e.keepalive.Unlock() + e.stack.Stats().TCP.EstablishedTimedout.Increment() + return tcpip.ErrTimeout + } + if e.keepalive.unacked >= e.keepalive.count { e.keepalive.Unlock() + e.stack.Stats().TCP.EstablishedTimedout.Increment() return tcpip.ErrTimeout } @@ -1073,7 +1132,6 @@ func (e *endpoint) keepaliveTimerExpired() *tcpip.Error { // whether it is enabled for this endpoint. func (e *endpoint) resetKeepaliveTimer(receivedData bool) { e.keepalive.Lock() - defer e.keepalive.Unlock() if receivedData { e.keepalive.unacked = 0 } @@ -1081,6 +1139,7 @@ func (e *endpoint) resetKeepaliveTimer(receivedData bool) { // data to send. if !e.keepalive.enabled || e.snd == nil || e.snd.sndUna != e.snd.sndNxt { e.keepalive.timer.disable() + e.keepalive.Unlock() return } if e.keepalive.unacked > 0 { @@ -1088,6 +1147,7 @@ func (e *endpoint) resetKeepaliveTimer(receivedData bool) { } else { e.keepalive.timer.enable(e.keepalive.idle) } + e.keepalive.Unlock() } // disableKeepaliveTimer stops the keepalive timer. @@ -1142,8 +1202,6 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error { e.lastErrorMu.Unlock() e.mu.Lock() - e.stack.Stats().TCP.EstablishedResets.Increment() - e.stack.Stats().TCP.CurrentEstablished.Decrement() e.state = StateError e.HardError = err @@ -1152,25 +1210,6 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error { return err } - - // Transfer handshake state to TCP connection. We disable - // receive window scaling if the peer doesn't support it - // (indicated by a negative send window scale). - e.snd = newSender(e, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale) - - rcvBufSize := seqnum.Size(e.receiveBufferSize()) - e.rcvListMu.Lock() - e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale(), rcvBufSize) - // boot strap the auto tuning algorithm. Starting at zero will - // result in a large step function on the first proper causing - // the window to just go to a really large value after the first - // RTT itself. - e.rcvAutoParams.prevCopied = initialRcvWnd - e.rcvListMu.Unlock() - e.stack.Stats().TCP.CurrentEstablished.Increment() - e.mu.Lock() - e.state = StateEstablished - e.mu.Unlock() } e.keepalive.timer.init(&e.keepalive.waker) @@ -1221,6 +1260,7 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error { w: &e.snd.resendWaker, f: func() *tcpip.Error { if !e.snd.retransmitTimerExpired() { + e.stack.Stats().TCP.EstablishedTimedout.Increment() return tcpip.ErrTimeout } return nil @@ -1371,7 +1411,6 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error { // Mark endpoint as closed. e.mu.Lock() if e.state != StateError { - e.stack.Stats().TCP.EstablishedResets.Increment() e.stack.Stats().TCP.CurrentEstablished.Decrement() e.transitionToStateCloseLocked() } @@ -1388,6 +1427,7 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error { if s == nil { break } + e.tryDeliverSegmentFromClosedEndpoint(s) } diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 9d4a87e30..fe629aa40 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -30,6 +30,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/iptables" + "gvisor.dev/gvisor/pkg/tcpip/ports" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tmutex" @@ -340,9 +341,11 @@ type endpoint struct { // TCP should never broadcast but Linux nevertheless supports enabling/ // disabling SO_BROADCAST, albeit as a NOOP. broadcast bool + // Values used to reserve a port or register a transport endpoint // (which ever happens first). boundBindToDevice tcpip.NICID + boundPortFlags ports.Flags // effectiveNetProtos contains the network protocols actually in use. In // most cases it will only contain "netProto", but in cases like IPv6 @@ -472,6 +475,12 @@ type endpoint struct { // without hearing a response, the connection is closed. keepalive keepalive + // userTimeout if non-zero specifies a user specified timeout for + // a connection w/ pending data to send. A connection that has pending + // unacked data will be forcibily aborted if the timeout is reached + // without any data being acked. + userTimeout time.Duration + // pendingAccepted is a synchronization primitive used to track number // of connections that are queued up to be delivered to the accepted // channel. We use this to ensure that all goroutines blocked on writing @@ -737,9 +746,10 @@ func (e *endpoint) Close() { e.isRegistered = false } - e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice) + e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice) e.isPortReserved = false e.boundBindToDevice = 0 + e.boundPortFlags = ports.Flags{} } // Mark endpoint as closed. @@ -800,10 +810,11 @@ func (e *endpoint) cleanupLocked() { } if e.isPortReserved { - e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice) + e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice) e.isPortReserved = false } e.boundBindToDevice = 0 + e.boundPortFlags = ports.Flags{} e.route.Release() e.stack.CompleteTransportEndpointCleanup(e) @@ -1329,6 +1340,12 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { e.notifyProtocolGoroutine(notifyKeepaliveChanged) return nil + case tcpip.TCPUserTimeoutOption: + e.mu.Lock() + e.userTimeout = time.Duration(v) + e.mu.Unlock() + return nil + case tcpip.BroadcastOption: e.mu.Lock() e.broadcast = v != 0 @@ -1587,6 +1604,12 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { e.keepalive.Unlock() return nil + case *tcpip.TCPUserTimeoutOption: + e.mu.Lock() + *o = tcpip.TCPUserTimeoutOption(e.userTimeout) + e.mu.Unlock() + return nil + case *tcpip.OutOfBandInlineOption: // We don't currently support disabling this option. *o = 1 @@ -1775,7 +1798,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc } // reusePort is false below because connect cannot reuse a port even if // reusePort was set. - if !e.stack.IsPortAvailable(netProtos, ProtocolNumber, e.ID.LocalAddress, p, false /* reusePort */, e.bindToDevice) { + if !e.stack.IsPortAvailable(netProtos, ProtocolNumber, e.ID.LocalAddress, p, ports.Flags{LoadBalanced: false}, e.bindToDevice) { return false, nil } @@ -1802,7 +1825,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc // before Connect: in such a case we don't want to hold on to // reservations anymore. if e.isPortReserved { - e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, origID.LocalAddress, origID.LocalPort, e.boundBindToDevice) + e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, origID.LocalAddress, origID.LocalPort, e.boundPortFlags, e.boundBindToDevice) e.isPortReserved = false } @@ -1951,6 +1974,15 @@ func (e *endpoint) listen(backlog int) *tcpip.Error { return nil } + if e.state == StateInitial { + // The listen is called on an unbound socket, the socket is + // automatically bound to a random free port with the local + // address set to INADDR_ANY. + if err := e.bindLocked(tcpip.FullAddress{}); err != nil { + return err + } + } + // Endpoint must be bound before it can transition to listen mode. if e.state != StateBound { e.stats.ReadErrors.InvalidEndpointState.Increment() @@ -2010,6 +2042,10 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) { e.mu.Lock() defer e.mu.Unlock() + return e.bindLocked(addr) +} + +func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) { // Don't allow binding once endpoint is not in the initial state // anymore. This is because once the endpoint goes into a connected or // listen state, it is already bound. @@ -2034,28 +2070,33 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) { } } - port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, e.reusePort, e.bindToDevice) + flags := ports.Flags{ + LoadBalanced: e.reusePort, + } + port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, flags, e.bindToDevice) if err != nil { return err } e.boundBindToDevice = e.bindToDevice + e.boundPortFlags = flags e.isPortReserved = true e.effectiveNetProtos = netProtos e.ID.LocalPort = port // Any failures beyond this point must remove the port registration. - defer func(bindToDevice tcpip.NICID) { + defer func(portFlags ports.Flags, bindToDevice tcpip.NICID) { if err != nil { - e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port, bindToDevice) + e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port, portFlags, bindToDevice) e.isPortReserved = false e.effectiveNetProtos = nil e.ID.LocalPort = 0 e.ID.LocalAddress = "" e.boundNICID = 0 e.boundBindToDevice = 0 + e.boundPortFlags = ports.Flags{} } - }(e.boundBindToDevice) + }(e.boundPortFlags, e.boundBindToDevice) // If an address is specified, we must ensure that it's one of our // local addresses. diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index 89b965c23..bc718064c 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -162,13 +162,26 @@ func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Transpo func replyWithReset(s *segment) { // Get the seqnum from the packet if the ack flag is set. seq := seqnum.Value(0) + ack := seqnum.Value(0) + flags := byte(header.TCPFlagRst) + // As per RFC 793 page 35 (Reset Generation) + // 1. If the connection does not exist (CLOSED) then a reset is sent + // in response to any incoming segment except another reset. In + // particular, SYNs addressed to a non-existent connection are rejected + // by this means. + + // If the incoming segment has an ACK field, the reset takes its + // sequence number from the ACK field of the segment, otherwise the + // reset has sequence number zero and the ACK field is set to the sum + // of the sequence number and segment length of the incoming segment. + // The connection remains in the CLOSED state. if s.flagIsSet(header.TCPFlagAck) { seq = s.ackNumber + } else { + flags |= header.TCPFlagAck + ack = s.sequenceNumber.Add(s.logicalLen()) } - - ack := s.sequenceNumber.Add(s.logicalLen()) - - sendTCP(&s.route, s.id, buffer.VectorisedView{}, s.route.DefaultTTL(), stack.DefaultTOS, header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0 /* rcvWnd */, nil /* options */, nil /* gso */) + sendTCP(&s.route, s.id, buffer.VectorisedView{}, s.route.DefaultTTL(), stack.DefaultTOS, flags, seq, ack, 0 /* rcvWnd */, nil /* options */, nil /* gso */) } // SetOption implements TransportProtocol.SetOption. diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go index 857dc445f..0a5534959 100644 --- a/pkg/tcpip/transport/tcp/rcv.go +++ b/pkg/tcpip/transport/tcp/rcv.go @@ -50,16 +50,20 @@ type receiver struct { pendingRcvdSegments segmentHeap pendingBufUsed seqnum.Size pendingBufSize seqnum.Size + + // Time when the last ack was received. + lastRcvdAckTime time.Time `state:".(unixTime)"` } func newReceiver(ep *endpoint, irs seqnum.Value, rcvWnd seqnum.Size, rcvWndScale uint8, pendingBufSize seqnum.Size) *receiver { return &receiver{ - ep: ep, - rcvNxt: irs + 1, - rcvAcc: irs.Add(rcvWnd + 1), - rcvWnd: rcvWnd, - rcvWndScale: rcvWndScale, - pendingBufSize: pendingBufSize, + ep: ep, + rcvNxt: irs + 1, + rcvAcc: irs.Add(rcvWnd + 1), + rcvWnd: rcvWnd, + rcvWndScale: rcvWndScale, + pendingBufSize: pendingBufSize, + lastRcvdAckTime: time.Now(), } } @@ -205,7 +209,7 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum // Handle ACK (not FIN-ACK, which we handled above) during one of the // shutdown states. - if s.flagIsSet(header.TCPFlagAck) { + if s.flagIsSet(header.TCPFlagAck) && s.ackNumber == r.ep.snd.sndNxt { r.ep.mu.Lock() switch r.ep.state { case StateFinWait1: @@ -360,6 +364,9 @@ func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) { return true, nil } + // Store the time of the last ack. + r.lastRcvdAckTime = time.Now() + // Defer segment processing if it can't be consumed now. if !r.consumeSegment(s, segSeq, segLen) { if segLen > 0 || s.flagIsSet(header.TCPFlagFin) { diff --git a/pkg/tcpip/transport/tcp/rcv_state.go b/pkg/tcpip/transport/tcp/rcv_state.go new file mode 100644 index 000000000..2bf21a2e7 --- /dev/null +++ b/pkg/tcpip/transport/tcp/rcv_state.go @@ -0,0 +1,29 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp + +import ( + "time" +) + +// saveLastRcvdAckTime is invoked by stateify. +func (r *receiver) saveLastRcvdAckTime() unixTime { + return unixTime{r.lastRcvdAckTime.Unix(), r.lastRcvdAckTime.UnixNano()} +} + +// loadLastRcvdAckTime is invoked by stateify. +func (r *receiver) loadLastRcvdAckTime(unix unixTime) { + r.lastRcvdAckTime = time.Unix(unix.second, unix.nano) +} diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index d3f7c9125..8a947dc66 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -28,8 +28,11 @@ import ( ) const ( - // minRTO is the minimum allowed value for the retransmit timeout. - minRTO = 200 * time.Millisecond + // MinRTO is the minimum allowed value for the retransmit timeout. + MinRTO = 200 * time.Millisecond + + // MaxRTO is the maximum allowed value for the retransmit timeout. + MaxRTO = 120 * time.Second // InitialCwnd is the initial congestion window. InitialCwnd = 10 @@ -134,6 +137,10 @@ type sender struct { // rttMeasureTime is the time when the rttMeasureSeqNum was sent. rttMeasureTime time.Time `state:".(unixTime)"` + // firstRetransmittedSegXmitTime is the original transmit time of + // the first segment that was retransmitted due to RTO expiration. + firstRetransmittedSegXmitTime time.Time `state:".(unixTime)"` + closed bool writeNext *segment writeList segmentList @@ -392,8 +399,8 @@ func (s *sender) updateRTO(rtt time.Duration) { s.rto = s.rtt.srtt + 4*s.rtt.rttvar s.rtt.Unlock() - if s.rto < minRTO { - s.rto = minRTO + if s.rto < MinRTO { + s.rto = MinRTO } } @@ -438,8 +445,30 @@ func (s *sender) retransmitTimerExpired() bool { s.ep.stack.Stats().TCP.Timeouts.Increment() s.ep.stats.SendErrors.Timeouts.Increment() - // Give up if we've waited more than a minute since the last resend. - if s.rto >= 60*time.Second { + // Give up if we've waited more than a minute since the last resend or + // if a user time out is set and we have exceeded the user specified + // timeout since the first retransmission. + s.ep.mu.RLock() + uto := s.ep.userTimeout + s.ep.mu.RUnlock() + + if s.firstRetransmittedSegXmitTime.IsZero() { + // We store the original xmitTime of the segment that we are + // about to retransmit as the retransmission time. This is + // required as by the time the retransmitTimer has expired the + // segment has already been sent and unacked for the RTO at the + // time the segment was sent. + s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime + } + + elapsed := time.Since(s.firstRetransmittedSegXmitTime) + remaining := MaxRTO + if uto != 0 { + // Cap to the user specified timeout if one is specified. + remaining = uto - elapsed + } + + if remaining <= 0 || s.rto >= MaxRTO { return false } @@ -447,6 +476,11 @@ func (s *sender) retransmitTimerExpired() bool { // below. s.rto *= 2 + // Cap RTO to remaining time. + if s.rto > remaining { + s.rto = remaining + } + // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4. // // Retransmit timeouts: @@ -674,7 +708,6 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se default: s.ep.state = StateFinWait1 } - s.ep.stack.Stats().TCP.CurrentEstablished.Decrement() s.ep.mu.Unlock() } else { // We're sending a non-FIN segment. @@ -1169,6 +1202,8 @@ func (s *sender) handleRcvdSegment(seg *segment) { // RFC 6298 Rule 5.3 if s.sndUna == s.sndNxt { s.outstanding = 0 + // Reset firstRetransmittedSegXmitTime to the zero value. + s.firstRetransmittedSegXmitTime = time.Time{} s.resendTimer.disable() } } diff --git a/pkg/tcpip/transport/tcp/snd_state.go b/pkg/tcpip/transport/tcp/snd_state.go index 12eff8afc..8b20c3455 100644 --- a/pkg/tcpip/transport/tcp/snd_state.go +++ b/pkg/tcpip/transport/tcp/snd_state.go @@ -48,3 +48,13 @@ func (s *sender) loadRttMeasureTime(unix unixTime) { func (s *sender) afterLoad() { s.resendTimer.init(&s.resendWaker) } + +// saveFirstRetransmittedSegXmitTime is invoked by stateify. +func (s *sender) saveFirstRetransmittedSegXmitTime() unixTime { + return unixTime{s.firstRetransmittedSegXmitTime.Unix(), s.firstRetransmittedSegXmitTime.UnixNano()} +} + +// loadFirstRetransmittedSegXmitTime is invoked by stateify. +func (s *sender) loadFirstRetransmittedSegXmitTime(unix unixTime) { + s.firstRetransmittedSegXmitTime = time.Unix(unix.second, unix.nano) +} diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index c4b45aa6f..e8fe4dab5 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -75,6 +75,20 @@ func TestGiveUpConnect(t *testing.T) { if err := ep.GetSockOpt(tcpip.ErrorOption{}); err != tcpip.ErrAborted { t.Fatalf("got ep.GetSockOpt(tcpip.ErrorOption{}) = %v, want = %v", err, tcpip.ErrAborted) } + + // Call Connect again to retreive the handshake failure status + // and stats updates. + if err := ep.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrAborted { + t.Fatalf("got ep.Connect(...) = %v, want = %v", err, tcpip.ErrAborted) + } + + if got := c.Stack().Stats().TCP.FailedConnectionAttempts.Value(); got != 1 { + t.Errorf("got stats.TCP.FailedConnectionAttempts.Value() = %v, want = 1", got) + } + + if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 { + t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got) + } } func TestConnectIncrementActiveConnection(t *testing.T) { @@ -309,8 +323,8 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) { checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), checker.SeqNum(uint32(c.IRS+1)), - checker.AckNum(uint32(iss)+1), - checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck))) + checker.AckNum(0), + checker.TCPFlags(header.TCPFlagRst))) } func TestTCPResetsReceivedIncrement(t *testing.T) { @@ -446,18 +460,17 @@ func TestConnectResetAfterClose(t *testing.T) { checker.TCP( checker.DstPort(context.TestPort), checker.SeqNum(uint32(c.IRS)+2), - checker.AckNum(790), - checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst), + checker.AckNum(0), + checker.TCPFlags(header.TCPFlagRst), ), ) break } } -// TestClosingWithEnqueuedSegments tests handling of -// still enqueued segments when the endpoint transitions -// to StateClose. The in-flight segments would be re-enqueued -// to a any listening endpoint. +// TestClosingWithEnqueuedSegments tests handling of still enqueued segments +// when the endpoint transitions to StateClose. The in-flight segments would be +// re-enqueued to a any listening endpoint. func TestClosingWithEnqueuedSegments(t *testing.T) { c := context.New(t, defaultMTU) defer c.Cleanup() @@ -541,21 +554,29 @@ func TestClosingWithEnqueuedSegments(t *testing.T) { ep.(interface{ ResumeWork() }).ResumeWork() // Wait for the protocolMainLoop to resume and update state. - time.Sleep(1 * time.Millisecond) + time.Sleep(10 * time.Millisecond) // Expect the endpoint to be closed. if got, want := tcp.EndpointState(ep.State()), tcp.StateClose; got != want { t.Errorf("Unexpected endpoint state: want %v, got %v", want, got) } + if got := c.Stack().Stats().TCP.EstablishedClosed.Value(); got != 1 { + t.Errorf("got c.Stack().Stats().TCP.EstablishedClosed = %v, want = 1", got) + } + + if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 { + t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got) + } + // Check if the endpoint was moved to CLOSED and netstack a reset in // response to the ACK packet that we sent after last-ACK. checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), checker.SeqNum(uint32(c.IRS)+2), - checker.AckNum(793), - checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst), + checker.AckNum(0), + checker.TCPFlags(header.TCPFlagRst), ), ) } @@ -792,6 +813,82 @@ func TestSendRstOnListenerRxSynAckV6(t *testing.T) { checker.SeqNum(200))) } +// TestTCPAckBeforeAcceptV4 tests that once the 3-way handshake is complete, +// peers can send data and expect a response within a reasonable ammount of time +// without calling Accept on the listening endpoint first. +// +// This test uses IPv4. +func TestTCPAckBeforeAcceptV4(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + c.Create(-1) + + if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { + t.Fatal("Bind failed:", err) + } + + if err := c.EP.Listen(10); err != nil { + t.Fatal("Listen failed:", err) + } + + irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */) + + // Send data before accepting the connection. + c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{ + SrcPort: context.TestPort, + DstPort: context.StackPort, + Flags: header.TCPFlagAck, + SeqNum: irs + 1, + AckNum: iss + 1, + }) + + // Receive ACK for the data we sent. + checker.IPv4(t, c.GetPacket(), checker.TCP( + checker.DstPort(context.TestPort), + checker.TCPFlags(header.TCPFlagAck), + checker.SeqNum(uint32(iss+1)), + checker.AckNum(uint32(irs+5)))) +} + +// TestTCPAckBeforeAcceptV6 tests that once the 3-way handshake is complete, +// peers can send data and expect a response within a reasonable ammount of time +// without calling Accept on the listening endpoint first. +// +// This test uses IPv6. +func TestTCPAckBeforeAcceptV6(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + c.CreateV6Endpoint(true) + + if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { + t.Fatal("Bind failed:", err) + } + + if err := c.EP.Listen(10); err != nil { + t.Fatal("Listen failed:", err) + } + + irs, iss := executeV6Handshake(t, c, context.TestPort, false /* synCookiesInUse */) + + // Send data before accepting the connection. + c.SendV6Packet([]byte{1, 2, 3, 4}, &context.Headers{ + SrcPort: context.TestPort, + DstPort: context.StackPort, + Flags: header.TCPFlagAck, + SeqNum: irs + 1, + AckNum: iss + 1, + }) + + // Receive ACK for the data we sent. + checker.IPv6(t, c.GetV6Packet(), checker.TCP( + checker.DstPort(context.TestPort), + checker.TCPFlags(header.TCPFlagAck), + checker.SeqNum(uint32(iss+1)), + checker.AckNum(uint32(irs+5)))) +} + func TestSendRstOnListenerRxAckV4(t *testing.T) { c := context.New(t, defaultMTU) defer c.Cleanup() @@ -816,7 +913,7 @@ func TestSendRstOnListenerRxAckV4(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), - checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck), + checker.TCPFlags(header.TCPFlagRst), checker.SeqNum(200))) } @@ -844,7 +941,7 @@ func TestSendRstOnListenerRxAckV6(t *testing.T) { checker.IPv6(t, c.GetV6Packet(), checker.TCP( checker.DstPort(context.TestPort), - checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck), + checker.TCPFlags(header.TCPFlagRst), checker.SeqNum(200))) } @@ -1043,6 +1140,71 @@ func TestConnectBindToDevice(t *testing.T) { } } +func TestRstOnSynSent(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + // Create an endpoint, don't handshake because we want to interfere with the + // handshake process. + c.Create(-1) + + // Start connection attempt. + waitEntry, ch := waiter.NewChannelEntry(nil) + c.WQ.EventRegister(&waitEntry, waiter.EventOut) + defer c.WQ.EventUnregister(&waitEntry) + + addr := tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort} + if err := c.EP.Connect(addr); err != tcpip.ErrConnectStarted { + t.Fatalf("got Connect(%+v) = %v, want %s", addr, err, tcpip.ErrConnectStarted) + } + + // Receive SYN packet. + b := c.GetPacket() + checker.IPv4(t, b, + checker.TCP( + checker.DstPort(context.TestPort), + checker.TCPFlags(header.TCPFlagSyn), + ), + ) + + // Ensure that we've reached SynSent state + if got, want := tcp.EndpointState(c.EP.State()), tcp.StateSynSent; got != want { + t.Fatalf("got State() = %s, want %s", got, want) + } + tcpHdr := header.TCP(header.IPv4(b).Payload()) + c.IRS = seqnum.Value(tcpHdr.SequenceNumber()) + + // Send a packet with a proper ACK and a RST flag to cause the socket + // to Error and close out + iss := seqnum.Value(789) + rcvWnd := seqnum.Size(30000) + c.SendPacket(nil, &context.Headers{ + SrcPort: tcpHdr.DestinationPort(), + DstPort: tcpHdr.SourcePort(), + Flags: header.TCPFlagRst | header.TCPFlagAck, + SeqNum: iss, + AckNum: c.IRS.Add(1), + RcvWnd: rcvWnd, + TCPOpts: nil, + }) + + // Wait for receive to be notified. + select { + case <-ch: + case <-time.After(3 * time.Second): + t.Fatal("timed out waiting for packet to arrive") + } + + if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionRefused { + t.Fatalf("got c.EP.Read(nil) = %v, want = %s", err, tcpip.ErrConnectionRefused) + } + + // Due to the RST the endpoint should be in an error state. + if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want { + t.Fatalf("got State() = %s, want %s", got, want) + } +} + func TestOutOfOrderReceive(t *testing.T) { c := context.New(t, defaultMTU) defer c.Cleanup() @@ -2618,6 +2780,13 @@ loop: if tcp.EndpointState(c.EP.State()) != tcp.StateError { t.Fatalf("got EP state is not StateError") } + + if got := c.Stack().Stats().TCP.EstablishedResets.Value(); got != 1 { + t.Errorf("got stats.TCP.EstablishedResets.Value() = %v, want = 1", got) + } + if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 { + t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got) + } } func TestSendOnResetConnection(t *testing.T) { @@ -4186,8 +4355,9 @@ func TestKeepalive(t *testing.T) { c.CreateConnected(789, 30000, -1 /* epRcvBuf */) + const keepAliveInterval = 10 * time.Millisecond c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(10 * time.Millisecond)) - c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(10 * time.Millisecond)) + c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval)) c.EP.SetSockOpt(tcpip.KeepaliveCountOption(5)) c.EP.SetSockOpt(tcpip.KeepaliveEnabledOption(1)) @@ -4277,19 +4447,43 @@ func TestKeepalive(t *testing.T) { ) } + // Sleep for a litte over the KeepAlive interval to make sure + // the timer has time to fire after the last ACK and close the + // close the socket. + time.Sleep(keepAliveInterval + 5*time.Millisecond) + // The connection should be terminated after 5 unacked keepalives. + // Send an ACK to trigger a RST from the stack as the endpoint should + // be dead. + c.SendPacket(nil, &context.Headers{ + SrcPort: context.TestPort, + DstPort: c.Port, + Flags: header.TCPFlagAck, + SeqNum: 790, + AckNum: seqnum.Value(next), + RcvWnd: 30000, + }) + checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), checker.SeqNum(uint32(next)), - checker.AckNum(uint32(790)), - checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst), + checker.AckNum(uint32(0)), + checker.TCPFlags(header.TCPFlagRst), ), ) + if got := c.Stack().Stats().TCP.EstablishedTimedout.Value(); got != 1 { + t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout.Value() = %v, want = 1", got) + } + if _, _, err := c.EP.Read(nil); err != tcpip.ErrTimeout { t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrTimeout) } + + if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 { + t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got) + } } func executeHandshake(t *testing.T, c *context.Context, srcPort uint16, synCookieInUse bool) (irs, iss seqnum.Value) { @@ -4303,7 +4497,7 @@ func executeHandshake(t *testing.T, c *context.Context, srcPort uint16, synCooki RcvWnd: 30000, }) - // Receive the SYN-ACK reply.w + // Receive the SYN-ACK reply. b := c.GetPacket() tcp := header.TCP(header.IPv4(b).Payload()) iss = seqnum.Value(tcp.SequenceNumber()) @@ -4336,6 +4530,50 @@ func executeHandshake(t *testing.T, c *context.Context, srcPort uint16, synCooki return irs, iss } +func executeV6Handshake(t *testing.T, c *context.Context, srcPort uint16, synCookieInUse bool) (irs, iss seqnum.Value) { + // Send a SYN request. + irs = seqnum.Value(789) + c.SendV6Packet(nil, &context.Headers{ + SrcPort: srcPort, + DstPort: context.StackPort, + Flags: header.TCPFlagSyn, + SeqNum: irs, + RcvWnd: 30000, + }) + + // Receive the SYN-ACK reply. + b := c.GetV6Packet() + tcp := header.TCP(header.IPv6(b).Payload()) + iss = seqnum.Value(tcp.SequenceNumber()) + tcpCheckers := []checker.TransportChecker{ + checker.SrcPort(context.StackPort), + checker.DstPort(srcPort), + checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn), + checker.AckNum(uint32(irs) + 1), + } + + if synCookieInUse { + // When cookies are in use window scaling is disabled. + tcpCheckers = append(tcpCheckers, checker.TCPSynOptions(header.TCPSynOptions{ + WS: -1, + MSS: c.MSSWithoutOptionsV6(), + })) + } + + checker.IPv6(t, b, checker.TCP(tcpCheckers...)) + + // Send ACK. + c.SendV6Packet(nil, &context.Headers{ + SrcPort: srcPort, + DstPort: context.StackPort, + Flags: header.TCPFlagAck, + SeqNum: irs + 1, + AckNum: iss + 1, + RcvWnd: 30000, + }) + return irs, iss +} + // TestListenBacklogFull tests that netstack does not complete handshakes if the // listen backlog for the endpoint is full. func TestListenBacklogFull(t *testing.T) { @@ -5512,6 +5750,7 @@ func TestTCPTimeWaitRSTIgnored(t *testing.T) { DstPort: context.StackPort, Flags: header.TCPFlagSyn, SeqNum: iss, + RcvWnd: 30000, }) // Receive the SYN-ACK reply. @@ -5630,6 +5869,7 @@ func TestTCPTimeWaitOutOfOrder(t *testing.T) { DstPort: context.StackPort, Flags: header.TCPFlagSyn, SeqNum: iss, + RcvWnd: 30000, }) // Receive the SYN-ACK reply. @@ -5736,6 +5976,7 @@ func TestTCPTimeWaitNewSyn(t *testing.T) { DstPort: context.StackPort, Flags: header.TCPFlagSyn, SeqNum: iss, + RcvWnd: 30000, }) // Receive the SYN-ACK reply. @@ -5809,6 +6050,7 @@ func TestTCPTimeWaitNewSyn(t *testing.T) { DstPort: context.StackPort, Flags: header.TCPFlagSyn, SeqNum: iss, + RcvWnd: 30000, }) c.CheckNoPacketTimeout("unexpected packet received in response to SYN", 1*time.Second) @@ -5821,6 +6063,7 @@ func TestTCPTimeWaitNewSyn(t *testing.T) { DstPort: context.StackPort, Flags: header.TCPFlagSyn, SeqNum: iss, + RcvWnd: 30000, }) // Receive the SYN-ACK reply. @@ -5867,6 +6110,8 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) { t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPLingerTimeoutOption(%d) failed: %s", tcpTimeWaitTimeout, err) } + want := c.Stack().Stats().TCP.EstablishedClosed.Value() + 1 + wq := &waiter.Queue{} ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq) if err != nil { @@ -5887,6 +6132,7 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) { DstPort: context.StackPort, Flags: header.TCPFlagSyn, SeqNum: iss, + RcvWnd: 30000, }) // Receive the SYN-ACK reply. @@ -5992,6 +6238,326 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) { checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), checker.SeqNum(uint32(ackHeaders.AckNum)), - checker.AckNum(uint32(ackHeaders.SeqNum)), - checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck))) + checker.AckNum(0), + checker.TCPFlags(header.TCPFlagRst))) + + if got := c.Stack().Stats().TCP.EstablishedClosed.Value(); got != want { + t.Errorf("got c.Stack().Stats().TCP.EstablishedClosed = %v, want = %v", got, want) + } + if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 { + t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got) + } +} + +func TestTCPCloseWithData(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + // Set TCPTimeWaitTimeout to 5 seconds so that sockets are marked closed + // after 5 seconds in TIME_WAIT state. + tcpTimeWaitTimeout := 5 * time.Second + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil { + t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPLingerTimeoutOption(%d) failed: %s", tcpTimeWaitTimeout, err) + } + + wq := &waiter.Queue{} + ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq) + if err != nil { + t.Fatalf("NewEndpoint failed: %s", err) + } + if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { + t.Fatalf("Bind failed: %s", err) + } + + if err := ep.Listen(10); err != nil { + t.Fatalf("Listen failed: %s", err) + } + + // Send a SYN request. + iss := seqnum.Value(789) + c.SendPacket(nil, &context.Headers{ + SrcPort: context.TestPort, + DstPort: context.StackPort, + Flags: header.TCPFlagSyn, + SeqNum: iss, + RcvWnd: 30000, + }) + + // Receive the SYN-ACK reply. + b := c.GetPacket() + tcpHdr := header.TCP(header.IPv4(b).Payload()) + c.IRS = seqnum.Value(tcpHdr.SequenceNumber()) + + ackHeaders := &context.Headers{ + SrcPort: context.TestPort, + DstPort: context.StackPort, + Flags: header.TCPFlagAck, + SeqNum: iss + 1, + AckNum: c.IRS + 1, + RcvWnd: 30000, + } + + // Send ACK. + c.SendPacket(nil, ackHeaders) + + // Try to accept the connection. + we, ch := waiter.NewChannelEntry(nil) + wq.EventRegister(&we, waiter.EventIn) + defer wq.EventUnregister(&we) + + c.EP, _, err = ep.Accept() + if err == tcpip.ErrWouldBlock { + // Wait for connection to be established. + select { + case <-ch: + c.EP, _, err = ep.Accept() + if err != nil { + t.Fatalf("Accept failed: %s", err) + } + + case <-time.After(1 * time.Second): + t.Fatalf("Timed out waiting for accept") + } + } + + // Now trigger a passive close by sending a FIN. + finHeaders := &context.Headers{ + SrcPort: context.TestPort, + DstPort: context.StackPort, + Flags: header.TCPFlagAck | header.TCPFlagFin, + SeqNum: iss + 1, + AckNum: c.IRS + 2, + RcvWnd: 30000, + } + + c.SendPacket(nil, finHeaders) + + // Get the ACK to the FIN we just sent. + checker.IPv4(t, c.GetPacket(), checker.TCP( + checker.SrcPort(context.StackPort), + checker.DstPort(context.TestPort), + checker.SeqNum(uint32(c.IRS+1)), + checker.AckNum(uint32(iss)+2), + checker.TCPFlags(header.TCPFlagAck))) + + // Now write a few bytes and then close the endpoint. + data := []byte{1, 2, 3} + view := buffer.NewView(len(data)) + copy(view, data) + + if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + t.Fatalf("Write failed: %s", err) + } + + // Check that data is received. + b = c.GetPacket() + checker.IPv4(t, b, + checker.PayloadLen(len(data)+header.TCPMinimumSize), + checker.TCP( + checker.DstPort(context.TestPort), + checker.SeqNum(uint32(c.IRS)+1), + checker.AckNum(uint32(iss)+2), // Acknum is initial sequence number + 1 + checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), + ), + ) + + if p := b[header.IPv4MinimumSize+header.TCPMinimumSize:]; !bytes.Equal(data, p) { + t.Errorf("got data = %x, want = %x", p, data) + } + + c.EP.Close() + // Check the FIN. + checker.IPv4(t, c.GetPacket(), checker.TCP( + checker.SrcPort(context.StackPort), + checker.DstPort(context.TestPort), + checker.SeqNum(uint32(c.IRS+1)+uint32(len(data))), + checker.AckNum(uint32(iss+2)), + checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck))) + + // First send a partial ACK. + ackHeaders = &context.Headers{ + SrcPort: context.TestPort, + DstPort: context.StackPort, + Flags: header.TCPFlagAck, + SeqNum: iss + 2, + AckNum: c.IRS + 1 + seqnum.Value(len(data)-1), + RcvWnd: 30000, + } + c.SendPacket(nil, ackHeaders) + + // Now send a full ACK. + ackHeaders = &context.Headers{ + SrcPort: context.TestPort, + DstPort: context.StackPort, + Flags: header.TCPFlagAck, + SeqNum: iss + 2, + AckNum: c.IRS + 1 + seqnum.Value(len(data)), + RcvWnd: 30000, + } + c.SendPacket(nil, ackHeaders) + + // Now ACK the FIN. + ackHeaders.AckNum++ + c.SendPacket(nil, ackHeaders) + + // Now send an ACK and we should get a RST back as the endpoint should + // be in CLOSED state. + ackHeaders = &context.Headers{ + SrcPort: context.TestPort, + DstPort: context.StackPort, + Flags: header.TCPFlagAck, + SeqNum: iss + 2, + AckNum: c.IRS + 1 + seqnum.Value(len(data)), + RcvWnd: 30000, + } + c.SendPacket(nil, ackHeaders) + + // Check the RST. + checker.IPv4(t, c.GetPacket(), checker.TCP( + checker.SrcPort(context.StackPort), + checker.DstPort(context.TestPort), + checker.SeqNum(uint32(ackHeaders.AckNum)), + checker.AckNum(0), + checker.TCPFlags(header.TCPFlagRst))) +} + +func TestTCPUserTimeout(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + c.CreateConnected(789, 30000, -1 /* epRcvBuf */) + + origEstablishedTimedout := c.Stack().Stats().TCP.EstablishedTimedout.Value() + + userTimeout := 50 * time.Millisecond + c.EP.SetSockOpt(tcpip.TCPUserTimeoutOption(userTimeout)) + + // Send some data and wait before ACKing it. + view := buffer.NewView(3) + if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + t.Fatalf("Write failed: %v", err) + } + + next := uint32(c.IRS) + 1 + checker.IPv4(t, c.GetPacket(), + checker.PayloadLen(len(view)+header.TCPMinimumSize), + checker.TCP( + checker.DstPort(context.TestPort), + checker.SeqNum(next), + checker.AckNum(790), + checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), + ), + ) + + // Wait for a little over the minimum retransmit timeout of 200ms for + // the retransmitTimer to fire and close the connection. + time.Sleep(tcp.MinRTO + 10*time.Millisecond) + + // No packet should be received as the connection should be silently + // closed due to timeout. + c.CheckNoPacket("unexpected packet received after userTimeout has expired") + + next += uint32(len(view)) + + // The connection should be terminated after userTimeout has expired. + // Send an ACK to trigger a RST from the stack as the endpoint should + // be dead. + c.SendPacket(nil, &context.Headers{ + SrcPort: context.TestPort, + DstPort: c.Port, + Flags: header.TCPFlagAck, + SeqNum: 790, + AckNum: seqnum.Value(next), + RcvWnd: 30000, + }) + + checker.IPv4(t, c.GetPacket(), + checker.TCP( + checker.DstPort(context.TestPort), + checker.SeqNum(uint32(next)), + checker.AckNum(uint32(0)), + checker.TCPFlags(header.TCPFlagRst), + ), + ) + + if _, _, err := c.EP.Read(nil); err != tcpip.ErrTimeout { + t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrTimeout) + } + + if got, want := c.Stack().Stats().TCP.EstablishedTimedout.Value(), origEstablishedTimedout+1; got != want { + t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout = %v, want = %v", got, want) + } +} + +func TestKeepaliveWithUserTimeout(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + c.CreateConnected(789, 30000, -1 /* epRcvBuf */) + + origEstablishedTimedout := c.Stack().Stats().TCP.EstablishedTimedout.Value() + + const keepAliveInterval = 10 * time.Millisecond + c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(10 * time.Millisecond)) + c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval)) + c.EP.SetSockOpt(tcpip.KeepaliveCountOption(10)) + c.EP.SetSockOpt(tcpip.KeepaliveEnabledOption(1)) + + // Set userTimeout to be the duration for 3 keepalive probes. + userTimeout := 30 * time.Millisecond + c.EP.SetSockOpt(tcpip.TCPUserTimeoutOption(userTimeout)) + + // Check that the connection is still alive. + if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { + t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrWouldBlock) + } + + // Now receive 2 keepalives, but don't ACK them. The connection should + // be reset when the 3rd one should be sent due to userTimeout being + // 30ms and each keepalive probe should be sent 10ms apart as set above after + // the connection has been idle for 10ms. + for i := 0; i < 2; i++ { + b := c.GetPacket() + checker.IPv4(t, b, + checker.TCP( + checker.DstPort(context.TestPort), + checker.SeqNum(uint32(c.IRS)), + checker.AckNum(uint32(790)), + checker.TCPFlags(header.TCPFlagAck), + ), + ) + } + + // Sleep for a litte over the KeepAlive interval to make sure + // the timer has time to fire after the last ACK and close the + // close the socket. + time.Sleep(keepAliveInterval + 5*time.Millisecond) + + // The connection should be terminated after 30ms. + // Send an ACK to trigger a RST from the stack as the endpoint should + // be dead. + c.SendPacket(nil, &context.Headers{ + SrcPort: context.TestPort, + DstPort: c.Port, + Flags: header.TCPFlagAck, + SeqNum: 790, + AckNum: seqnum.Value(c.IRS + 1), + RcvWnd: 30000, + }) + + checker.IPv4(t, c.GetPacket(), + checker.TCP( + checker.DstPort(context.TestPort), + checker.SeqNum(uint32(c.IRS+1)), + checker.AckNum(uint32(0)), + checker.TCPFlags(header.TCPFlagRst), + ), + ) + + if _, _, err := c.EP.Read(nil); err != tcpip.ErrTimeout { + t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrTimeout) + } + if got, want := c.Stack().Stats().TCP.EstablishedTimedout.Value(), origEstablishedTimedout+1; got != want { + t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout = %v, want = %v", got, want) + } } diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go index 04fdaaed1..b0a376eba 100644 --- a/pkg/tcpip/transport/tcp/testing/context/context.go +++ b/pkg/tcpip/transport/tcp/testing/context/context.go @@ -231,6 +231,7 @@ func (c *Context) CheckNoPacket(errMsg string) { // addresses. It will fail with an error if no packet is received for // 2 seconds. func (c *Context) GetPacket() []byte { + c.t.Helper() select { case p := <-c.linkEP.C: if p.Proto != ipv4.ProtocolNumber { @@ -259,6 +260,7 @@ func (c *Context) GetPacket() []byte { // and destination address. If no packet is available it will return // nil immediately. func (c *Context) GetPacketNonBlocking() []byte { + c.t.Helper() select { case p := <-c.linkEP.C: if p.Proto != ipv4.ProtocolNumber { @@ -483,6 +485,7 @@ func (c *Context) CreateV6Endpoint(v6only bool) { // GetV6Packet reads a single packet from the link layer endpoint of the context // and asserts that it is an IPv6 Packet with the expected src/dest addresses. func (c *Context) GetV6Packet() []byte { + c.t.Helper() select { case p := <-c.linkEP.C: if p.Proto != ipv6.ProtocolNumber { @@ -1089,3 +1092,9 @@ func (c *Context) SetGSOEnabled(enable bool) { func (c *Context) MSSWithoutOptions() uint16 { return uint16(c.linkEP.MTU() - header.IPv4MinimumSize - header.TCPMinimumSize) } + +// MSSWithoutOptionsV6 returns the value for the MSS used by the stack when no +// options are in use for IPv6 packets. +func (c *Context) MSSWithoutOptionsV6() uint16 { + return uint16(c.linkEP.MTU() - header.IPv6MinimumSize - header.TCPMinimumSize) +} diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD index 8d4c3808f..97e4d5825 100644 --- a/pkg/tcpip/transport/udp/BUILD +++ b/pkg/tcpip/transport/udp/BUILD @@ -34,6 +34,7 @@ go_library( "//pkg/tcpip/buffer", "//pkg/tcpip/header", "//pkg/tcpip/iptables", + "//pkg/tcpip/ports", "//pkg/tcpip/stack", "//pkg/tcpip/transport/raw", "//pkg/waiter", diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 24cb88c13..1ac4705af 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/iptables" + "gvisor.dev/gvisor/pkg/tcpip/ports" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/waiter" ) @@ -107,6 +108,7 @@ type endpoint struct { // Values used to reserve a port or register a transport endpoint. // (which ever happens first). boundBindToDevice tcpip.NICID + boundPortFlags ports.Flags // sendTOS represents IPv4 TOS or IPv6 TrafficClass, // applied while sending packets. Defaults to 0 as on Linux. @@ -180,8 +182,9 @@ func (e *endpoint) Close() { switch e.state { case StateBound, StateConnected: e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice) - e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice) + e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice) e.boundBindToDevice = 0 + e.boundPortFlags = ports.Flags{} } for _, mem := range e.multicastMemberships { @@ -895,7 +898,8 @@ func (e *endpoint) Disconnect() *tcpip.Error { } else { if e.ID.LocalPort != 0 { // Release the ephemeral port. - e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice) + e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice) + e.boundPortFlags = ports.Flags{} } e.state = StateInitial } @@ -1042,16 +1046,23 @@ func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, tcpip.NICID, *tcpip.Error) { if e.ID.LocalPort == 0 { - port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.reusePort, e.bindToDevice) + flags := ports.Flags{ + LoadBalanced: e.reusePort, + // FIXME(b/129164367): Support SO_REUSEADDR. + MostRecent: false, + } + port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, flags, e.bindToDevice) if err != nil { return id, e.bindToDevice, err } + e.boundPortFlags = flags id.LocalPort = port } err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, id, e, e.reusePort, e.bindToDevice) if err != nil { - e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.bindToDevice) + e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.boundPortFlags, e.bindToDevice) + e.boundPortFlags = ports.Flags{} } return id, e.bindToDevice, err } @@ -1134,9 +1145,14 @@ func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { e.mu.RLock() defer e.mu.RUnlock() + addr := e.ID.LocalAddress + if e.state == StateConnected { + addr = e.route.LocalAddress + } + return tcpip.FullAddress{ NIC: e.RegisterNICID, - Addr: e.ID.LocalAddress, + Addr: addr, Port: e.ID.LocalPort, }, nil } diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 847d2f91c..6226b63f8 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -7,6 +7,7 @@ go_library( srcs = [ "compat.go", "compat_amd64.go", + "compat_arm64.go", "config.go", "controller.go", "debug.go", @@ -110,7 +111,6 @@ go_test( "//pkg/control/server", "//pkg/log", "//pkg/p9", - "//pkg/sentry/arch:registers_go_proto", "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index 07e35ab10..352e710d2 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -21,10 +21,8 @@ import ( "syscall" "github.com/golang/protobuf/proto" - "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/arch" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto" "gvisor.dev/gvisor/pkg/sentry/strace" @@ -53,9 +51,9 @@ type compatEmitter struct { } func newCompatEmitter(logFD int) (*compatEmitter, error) { - nameMap, ok := strace.Lookup(abi.Linux, arch.AMD64) + nameMap, ok := getSyscallNameMap() if !ok { - return nil, fmt.Errorf("amd64 Linux syscall table not found") + return nil, fmt.Errorf("Linux syscall table not found") } c := &compatEmitter{ @@ -86,16 +84,16 @@ func (c *compatEmitter) Emit(msg proto.Message) (bool, error) { } func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) { - regs := us.Registers.GetArch().(*rpb.Registers_Amd64).Amd64 + regs := us.Registers c.mu.Lock() defer c.mu.Unlock() - sysnr := regs.OrigRax + sysnr := syscallNum(regs) tr := c.trackers[sysnr] if tr == nil { switch sysnr { - case syscall.SYS_PRCTL, syscall.SYS_ARCH_PRCTL: + case syscall.SYS_PRCTL: // args: cmd, ... tr = newArgsTracker(0) @@ -112,10 +110,14 @@ func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) { tr = newArgsTracker(2) default: - tr = &onceTracker{} + tr = newArchArgsTracker(sysnr) + if tr == nil { + tr = &onceTracker{} + } } c.trackers[sysnr] = tr } + if tr.shouldReport(regs) { c.sink.Infof("Unsupported syscall: %s, regs: %+v", c.nameMap.Name(uintptr(sysnr)), regs) tr.onReported(regs) @@ -139,10 +141,10 @@ func (c *compatEmitter) Close() error { // the syscall and arguments. type syscallTracker interface { // shouldReport returns true is the syscall should be reported. - shouldReport(regs *rpb.AMD64Registers) bool + shouldReport(regs *rpb.Registers) bool // onReported marks the syscall as reported. - onReported(regs *rpb.AMD64Registers) + onReported(regs *rpb.Registers) } // onceTracker reports only a single time, used for most syscalls. @@ -150,10 +152,45 @@ type onceTracker struct { reported bool } -func (o *onceTracker) shouldReport(_ *rpb.AMD64Registers) bool { +func (o *onceTracker) shouldReport(_ *rpb.Registers) bool { return !o.reported } -func (o *onceTracker) onReported(_ *rpb.AMD64Registers) { +func (o *onceTracker) onReported(_ *rpb.Registers) { o.reported = true } + +// argsTracker reports only once for each different combination of arguments. +// It's used for generic syscalls like ioctl to report once per 'cmd'. +type argsTracker struct { + // argsIdx is the syscall arguments to use as unique ID. + argsIdx []int + reported map[string]struct{} + count int +} + +func newArgsTracker(argIdx ...int) *argsTracker { + return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})} +} + +// key returns the command based on the syscall argument index. +func (a *argsTracker) key(regs *rpb.Registers) string { + var rv string + for _, idx := range a.argsIdx { + rv += fmt.Sprintf("%d|", argVal(idx, regs)) + } + return rv +} + +func (a *argsTracker) shouldReport(regs *rpb.Registers) bool { + if a.count >= reportLimit { + return false + } + _, ok := a.reported[a.key(regs)] + return !ok +} + +func (a *argsTracker) onReported(regs *rpb.Registers) { + a.count++ + a.reported[a.key(regs)] = struct{}{} +} diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go index 43cd0db94..42b0ca8b0 100644 --- a/runsc/boot/compat_amd64.go +++ b/runsc/boot/compat_amd64.go @@ -16,62 +16,81 @@ package boot import ( "fmt" + "syscall" + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/sentry/arch" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" + "gvisor.dev/gvisor/pkg/sentry/strace" ) // reportLimit is the max number of events that should be reported per tracker. const reportLimit = 100 -// argsTracker reports only once for each different combination of arguments. -// It's used for generic syscalls like ioctl to report once per 'cmd'. -type argsTracker struct { - // argsIdx is the syscall arguments to use as unique ID. - argsIdx []int - reported map[string]struct{} - count int +// newRegs create a empty Registers instance. +func newRegs() *rpb.Registers { + return &rpb.Registers{ + Arch: &rpb.Registers_Amd64{ + Amd64: &rpb.AMD64Registers{}, + }, + } } -func newArgsTracker(argIdx ...int) *argsTracker { - return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})} -} +func argVal(argIdx int, regs *rpb.Registers) uint32 { + amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64 -// cmd returns the command based on the syscall argument index. -func (a *argsTracker) key(regs *rpb.AMD64Registers) string { - var rv string - for _, idx := range a.argsIdx { - rv += fmt.Sprintf("%d|", argVal(idx, regs)) + switch argIdx { + case 0: + return uint32(amd64Regs.Rdi) + case 1: + return uint32(amd64Regs.Rsi) + case 2: + return uint32(amd64Regs.Rdx) + case 3: + return uint32(amd64Regs.R10) + case 4: + return uint32(amd64Regs.R8) + case 5: + return uint32(amd64Regs.R9) } - return rv + panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) } -func argVal(argIdx int, regs *rpb.AMD64Registers) uint32 { +func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) { + amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64 + switch argIdx { case 0: - return uint32(regs.Rdi) + amd64Regs.Rdi = argVal case 1: - return uint32(regs.Rsi) + amd64Regs.Rsi = argVal case 2: - return uint32(regs.Rdx) + amd64Regs.Rdx = argVal case 3: - return uint32(regs.R10) + amd64Regs.R10 = argVal case 4: - return uint32(regs.R8) + amd64Regs.R8 = argVal case 5: - return uint32(regs.R9) + amd64Regs.R9 = argVal + default: + panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) } - panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) } -func (a *argsTracker) shouldReport(regs *rpb.AMD64Registers) bool { - if a.count >= reportLimit { - return false - } - _, ok := a.reported[a.key(regs)] - return !ok +func getSyscallNameMap() (strace.SyscallMap, bool) { + return strace.Lookup(abi.Linux, arch.AMD64) +} + +func syscallNum(regs *rpb.Registers) uint64 { + amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64 + return amd64Regs.OrigRax } -func (a *argsTracker) onReported(regs *rpb.AMD64Registers) { - a.count++ - a.reported[a.key(regs)] = struct{}{} +func newArchArgsTracker(sysnr uint64) syscallTracker { + switch sysnr { + case syscall.SYS_ARCH_PRCTL: + // args: cmd, ... + return newArgsTracker(0) + } + return nil } diff --git a/runsc/boot/compat_arm64.go b/runsc/boot/compat_arm64.go new file mode 100644 index 000000000..f784cd237 --- /dev/null +++ b/runsc/boot/compat_arm64.go @@ -0,0 +1,91 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/sentry/arch" + rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" + "gvisor.dev/gvisor/pkg/sentry/strace" +) + +// reportLimit is the max number of events that should be reported per tracker. +const reportLimit = 100 + +// newRegs create a empty Registers instance. +func newRegs() *rpb.Registers { + return &rpb.Registers{ + Arch: &rpb.Registers_Arm64{ + Arm64: &rpb.ARM64Registers{}, + }, + } +} + +func argVal(argIdx int, regs *rpb.Registers) uint32 { + arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64 + + switch argIdx { + case 0: + return uint32(arm64Regs.R0) + case 1: + return uint32(arm64Regs.R1) + case 2: + return uint32(arm64Regs.R2) + case 3: + return uint32(arm64Regs.R3) + case 4: + return uint32(arm64Regs.R4) + case 5: + return uint32(arm64Regs.R5) + } + panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) +} + +func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) { + arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64 + + switch argIdx { + case 0: + arm64Regs.R0 = argVal + case 1: + arm64Regs.R1 = argVal + case 2: + arm64Regs.R2 = argVal + case 3: + arm64Regs.R3 = argVal + case 4: + arm64Regs.R4 = argVal + case 5: + arm64Regs.R5 = argVal + default: + panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) + } +} + +func getSyscallNameMap() (strace.SyscallMap, bool) { + return strace.Lookup(abi.Linux, arch.ARM64) +} + +func syscallNum(regs *rpb.Registers) uint64 { + arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64 + return arm64Regs.R8 +} + +func newArchArgsTracker(sysnr uint64) syscallTracker { + // currently, no arch specific syscalls need to be handled here. + return nil +} diff --git a/runsc/boot/compat_test.go b/runsc/boot/compat_test.go index 388298d8d..839c5303b 100644 --- a/runsc/boot/compat_test.go +++ b/runsc/boot/compat_test.go @@ -16,8 +16,6 @@ package boot import ( "testing" - - rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" ) func TestOnceTracker(t *testing.T) { @@ -35,31 +33,34 @@ func TestOnceTracker(t *testing.T) { func TestArgsTracker(t *testing.T) { for _, tc := range []struct { - name string - idx []int - rdi1 uint64 - rdi2 uint64 - rsi1 uint64 - rsi2 uint64 - want bool + name string + idx []int + arg1_1 uint64 + arg1_2 uint64 + arg2_1 uint64 + arg2_2 uint64 + want bool }{ - {name: "same rdi", idx: []int{0}, rdi1: 123, rdi2: 123, want: false}, - {name: "same rsi", idx: []int{1}, rsi1: 123, rsi2: 123, want: false}, - {name: "diff rdi", idx: []int{0}, rdi1: 123, rdi2: 321, want: true}, - {name: "diff rsi", idx: []int{1}, rsi1: 123, rsi2: 321, want: true}, - {name: "cmd is uint32", idx: []int{0}, rsi1: 0xdead00000123, rsi2: 0xbeef00000123, want: false}, - {name: "same 2 args", idx: []int{0, 1}, rsi1: 123, rdi1: 321, rsi2: 123, rdi2: 321, want: false}, - {name: "diff 2 args", idx: []int{0, 1}, rsi1: 123, rdi1: 321, rsi2: 789, rdi2: 987, want: true}, + {name: "same arg1", idx: []int{0}, arg1_1: 123, arg1_2: 123, want: false}, + {name: "same arg2", idx: []int{1}, arg2_1: 123, arg2_2: 123, want: false}, + {name: "diff arg1", idx: []int{0}, arg1_1: 123, arg1_2: 321, want: true}, + {name: "diff arg2", idx: []int{1}, arg2_1: 123, arg2_2: 321, want: true}, + {name: "cmd is uint32", idx: []int{0}, arg2_1: 0xdead00000123, arg2_2: 0xbeef00000123, want: false}, + {name: "same 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 123, arg1_2: 321, want: false}, + {name: "diff 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 789, arg1_2: 987, want: true}, } { t.Run(tc.name, func(t *testing.T) { c := newArgsTracker(tc.idx...) - regs := &rpb.AMD64Registers{Rdi: tc.rdi1, Rsi: tc.rsi1} + regs := newRegs() + setArgVal(0, tc.arg1_1, regs) + setArgVal(1, tc.arg2_1, regs) if !c.shouldReport(regs) { t.Error("first call to shouldReport, got: false, want: true") } c.onReported(regs) - regs.Rdi, regs.Rsi = tc.rdi2, tc.rsi2 + setArgVal(0, tc.arg1_2, regs) + setArgVal(1, tc.arg2_2, regs) if got := c.shouldReport(regs); tc.want != got { t.Errorf("second call to shouldReport, got: %t, want: %t", got, tc.want) } @@ -70,7 +71,9 @@ func TestArgsTracker(t *testing.T) { func TestArgsTrackerLimit(t *testing.T) { c := newArgsTracker(0, 1) for i := 0; i < reportLimit; i++ { - regs := &rpb.AMD64Registers{Rdi: 123, Rsi: uint64(i)} + regs := newRegs() + setArgVal(0, 123, regs) + setArgVal(1, uint64(i), regs) if !c.shouldReport(regs) { t.Error("shouldReport before limit was reached, got: false, want: true") } @@ -78,7 +81,9 @@ func TestArgsTrackerLimit(t *testing.T) { } // Should hit the count limit now. - regs := &rpb.AMD64Registers{Rdi: 123, Rsi: 123456} + regs := newRegs() + setArgVal(0, 123, regs) + setArgVal(1, 123456, regs) if c.shouldReport(regs) { t.Error("shouldReport after limit was reached, got: true, want: false") } diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 72a33534f..a878bc2ce 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -250,6 +250,12 @@ type Config struct { // multiple tests are run in parallel, since there is no way to pass // parameters to the runtime from docker. TestOnlyTestNameEnv string + + // CPUNumFromQuota sets CPU number count to available CPU quota, using + // least integer value greater than or equal to quota. + // + // E.g. 0.2 CPU quota will result in 1, and 1.9 in 2. + CPUNumFromQuota bool } // ToFlags returns a slice of flags that correspond to the given Config. @@ -282,6 +288,9 @@ func (c *Config) ToFlags() []string { "--software-gso=" + strconv.FormatBool(c.SoftwareGSO), "--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead), } + if c.CPUNumFromQuota { + f = append(f, "--cpu-num-from-quota") + } // Only include these if set since it is never to be used by users. if c.TestOnlyAllowRunAsCurrentUserWithoutChroot { f = append(f, "--TESTONLY-unsafe-nonroot=true") diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index f62be4c59..9c9e94864 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -152,7 +152,9 @@ func newController(fd int, l *Loader) (*controller, error) { srv.Register(&debug{}) srv.Register(&control.Logging{}) if l.conf.ProfileEnable { - srv.Register(&control.Profile{}) + srv.Register(&control.Profile{ + Kernel: l.k, + }) } return &controller{ diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 677356193..4fb9adca6 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -134,11 +134,6 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowValue(syscall.SOL_SOCKET), seccomp.AllowValue(syscall.SO_SNDBUF), }, - { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_REUSEADDR), - }, }, syscall.SYS_GETTID: {}, syscall.SYS_GETTIMEOFDAY: {}, @@ -317,6 +312,26 @@ func hostInetFilters() seccomp.SyscallRules { syscall.SYS_GETSOCKOPT: []seccomp.Rule{ { seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IP), + seccomp.AllowValue(syscall.IP_TOS), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IP), + seccomp.AllowValue(syscall.IP_RECVTOS), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_TCLASS), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_RECVTCLASS), + }, + { + seccomp.AllowAny{}, seccomp.AllowValue(syscall.SOL_IPV6), seccomp.AllowValue(syscall.IPV6_V6ONLY), }, @@ -418,6 +433,34 @@ func hostInetFilters() seccomp.SyscallRules { seccomp.AllowAny{}, seccomp.AllowValue(4), }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IP), + seccomp.AllowValue(syscall.IP_TOS), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IP), + seccomp.AllowValue(syscall.IP_RECVTOS), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_TCLASS), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_RECVTCLASS), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, }, syscall.SYS_SHUTDOWN: []seccomp.Rule{ { diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index bc9ffaf81..421ccd255 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -16,7 +16,6 @@ package boot import ( "fmt" - "path" "path/filepath" "sort" "strconv" @@ -52,7 +51,7 @@ const ( rootDevice = "9pfs-/" // MountPrefix is the annotation prefix for mount hints. - MountPrefix = "gvisor.dev/spec/mount" + MountPrefix = "dev.gvisor.spec.mount." // Filesystems that runsc supports. bind = "bind" @@ -490,14 +489,15 @@ type podMountHints struct { func newPodMountHints(spec *specs.Spec) (*podMountHints, error) { mnts := make(map[string]*mountHint) for k, v := range spec.Annotations { - // Look for 'gvisor.dev/spec/mount' annotations and parse them. + // Look for 'dev.gvisor.spec.mount' annotations and parse them. if strings.HasPrefix(k, MountPrefix) { - parts := strings.Split(k, "/") - if len(parts) != 5 { + // Remove the prefix and split the rest. + parts := strings.Split(k[len(MountPrefix):], ".") + if len(parts) != 2 { return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v) } - name := parts[3] - if len(name) == 0 || path.Clean(name) != name { + name := parts[0] + if len(name) == 0 { return nil, fmt.Errorf("invalid mount name: %s", name) } mnt := mnts[name] @@ -505,7 +505,7 @@ func newPodMountHints(spec *specs.Spec) (*podMountHints, error) { mnt = &mountHint{name: name} mnts[name] = mnt } - if err := mnt.setField(parts[4], v); err != nil { + if err := mnt.setField(parts[1], v); err != nil { return nil, err } } @@ -575,6 +575,11 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin func (c *containerMounter) processHints(conf *Config) error { ctx := c.k.SupervisorContext() for _, hint := range c.hints.mounts { + // TODO(b/142076984): Only support tmpfs for now. Bind mounts require a + // common gofer to mount all shared volumes. + if hint.mount.Type != tmpfs { + continue + } log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type) inode, err := c.mountSharedMaster(ctx, conf, hint) if err != nil { @@ -851,7 +856,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns return fmt.Errorf("mount %q error: %v", m.Destination, err) } - log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type) + log.Infof("Mounted %q to %q type: %s, internal-options: %q", m.Source, m.Destination, m.Type, opts) return nil } diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go index 0396a4cfb..912037075 100644 --- a/runsc/boot/fs_test.go +++ b/runsc/boot/fs_test.go @@ -15,7 +15,6 @@ package boot import ( - "path" "reflect" "strings" "testing" @@ -26,19 +25,19 @@ import ( func TestPodMountHintsHappy(t *testing.T) { spec := &specs.Spec{ Annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "foo", - path.Join(MountPrefix, "mount1", "type"): "tmpfs", - path.Join(MountPrefix, "mount1", "share"): "pod", + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", - path.Join(MountPrefix, "mount2", "source"): "bar", - path.Join(MountPrefix, "mount2", "type"): "bind", - path.Join(MountPrefix, "mount2", "share"): "container", - path.Join(MountPrefix, "mount2", "options"): "rw,private", + MountPrefix + "mount2.source": "bar", + MountPrefix + "mount2.type": "bind", + MountPrefix + "mount2.share": "container", + MountPrefix + "mount2.options": "rw,private", }, } podHints, err := newPodMountHints(spec) if err != nil { - t.Errorf("newPodMountHints failed: %v", err) + t.Fatalf("newPodMountHints failed: %v", err) } // Check that fields were set correctly. @@ -86,95 +85,95 @@ func TestPodMountHintsErrors(t *testing.T) { { name: "too short", annotations: map[string]string{ - path.Join(MountPrefix, "mount1"): "foo", + MountPrefix + "mount1": "foo", }, error: "invalid mount annotation", }, { name: "no name", annotations: map[string]string{ - MountPrefix + "//source": "foo", + MountPrefix + ".source": "foo", }, error: "invalid mount name", }, { name: "missing source", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "type"): "tmpfs", - path.Join(MountPrefix, "mount1", "share"): "pod", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", }, error: "source field", }, { name: "missing type", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "foo", - path.Join(MountPrefix, "mount1", "share"): "pod", + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.share": "pod", }, error: "type field", }, { name: "missing share", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "foo", - path.Join(MountPrefix, "mount1", "type"): "tmpfs", + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", }, error: "share field", }, { name: "invalid field name", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "invalid"): "foo", + MountPrefix + "mount1.invalid": "foo", }, error: "invalid mount annotation", }, { name: "invalid source", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "", - path.Join(MountPrefix, "mount1", "type"): "tmpfs", - path.Join(MountPrefix, "mount1", "share"): "pod", + MountPrefix + "mount1.source": "", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", }, error: "source cannot be empty", }, { name: "invalid type", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "foo", - path.Join(MountPrefix, "mount1", "type"): "invalid-type", - path.Join(MountPrefix, "mount1", "share"): "pod", + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "invalid-type", + MountPrefix + "mount1.share": "pod", }, error: "invalid type", }, { name: "invalid share", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "foo", - path.Join(MountPrefix, "mount1", "type"): "tmpfs", - path.Join(MountPrefix, "mount1", "share"): "invalid-share", + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "invalid-share", }, error: "invalid share", }, { name: "invalid options", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "foo", - path.Join(MountPrefix, "mount1", "type"): "tmpfs", - path.Join(MountPrefix, "mount1", "share"): "pod", - path.Join(MountPrefix, "mount1", "options"): "invalid-option", + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", + MountPrefix + "mount1.options": "invalid-option", }, error: "unknown mount option", }, { name: "duplicate source", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "foo", - path.Join(MountPrefix, "mount1", "type"): "tmpfs", - path.Join(MountPrefix, "mount1", "share"): "pod", + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", - path.Join(MountPrefix, "mount2", "source"): "foo", - path.Join(MountPrefix, "mount2", "type"): "bind", - path.Join(MountPrefix, "mount2", "share"): "container", + MountPrefix + "mount2.source": "foo", + MountPrefix + "mount2.type": "bind", + MountPrefix + "mount2.share": "container", }, error: "have the same mount source", }, @@ -202,36 +201,36 @@ func TestGetMountAccessType(t *testing.T) { { name: "container=exclusive", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): source, - path.Join(MountPrefix, "mount1", "type"): "bind", - path.Join(MountPrefix, "mount1", "share"): "container", + MountPrefix + "mount1.source": source, + MountPrefix + "mount1.type": "bind", + MountPrefix + "mount1.share": "container", }, want: FileAccessExclusive, }, { name: "pod=shared", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): source, - path.Join(MountPrefix, "mount1", "type"): "bind", - path.Join(MountPrefix, "mount1", "share"): "pod", + MountPrefix + "mount1.source": source, + MountPrefix + "mount1.type": "bind", + MountPrefix + "mount1.share": "pod", }, want: FileAccessShared, }, { name: "shared=shared", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): source, - path.Join(MountPrefix, "mount1", "type"): "bind", - path.Join(MountPrefix, "mount1", "share"): "shared", + MountPrefix + "mount1.source": source, + MountPrefix + "mount1.type": "bind", + MountPrefix + "mount1.share": "shared", }, want: FileAccessShared, }, { name: "default=shared", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): source + "mismatch", - path.Join(MountPrefix, "mount1", "type"): "bind", - path.Join(MountPrefix, "mount1", "share"): "container", + MountPrefix + "mount1.source": source + "mismatch", + MountPrefix + "mount1.type": "bind", + MountPrefix + "mount1.share": "container", }, want: FileAccessShared, }, diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index df6052c88..bc1d0c1bb 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -93,10 +93,6 @@ type Loader struct { // spec is the base configuration for the root container. spec *specs.Spec - // startSignalForwarding enables forwarding of signals to the sandboxed - // container. It should be called after the init process is loaded. - startSignalForwarding func() func() - // stopSignalForwarding disables forwarding of signals to the sandboxed // container. It should be called when a sandbox is destroyed. stopSignalForwarding func() @@ -336,29 +332,6 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("ignore child stop signals failed: %v", err) } - // Handle signals by forwarding them to the root container process - // (except for panic signal, which should cause a panic). - l.startSignalForwarding = sighandling.PrepareHandler(func(sig linux.Signal) { - // Panic signal should cause a panic. - if args.Conf.PanicSignal != -1 && sig == linux.Signal(args.Conf.PanicSignal) { - panic("Signal-induced panic") - } - - // Otherwise forward to root container. - deliveryMode := DeliverToProcess - if args.Console { - // Since we are running with a console, we should - // forward the signal to the foreground process group - // so that job control signals like ^C can be handled - // properly. - deliveryMode = DeliverToForegroundProcessGroup - } - log.Infof("Received external signal %d, mode: %v", sig, deliveryMode) - if err := l.signal(args.ID, 0, int32(sig), deliveryMode); err != nil { - log.Warningf("error sending signal %v to container %q: %v", sig, args.ID, err) - } - }) - // Create the control server using the provided FD. // // This must be done *after* we have initialized the kernel since the @@ -566,8 +539,27 @@ func (l *Loader) run() error { ep.tty.InitForegroundProcessGroup(ep.tg.ProcessGroup()) } - // Start signal forwarding only after an init process is created. - l.stopSignalForwarding = l.startSignalForwarding() + // Handle signals by forwarding them to the root container process + // (except for panic signal, which should cause a panic). + l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) { + // Panic signal should cause a panic. + if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) { + panic("Signal-induced panic") + } + + // Otherwise forward to root container. + deliveryMode := DeliverToProcess + if l.console { + // Since we are running with a console, we should forward the signal to + // the foreground process group so that job control signals like ^C can + // be handled properly. + deliveryMode = DeliverToForegroundProcessGroup + } + log.Infof("Received external signal %d, mode: %v", sig, deliveryMode) + if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil { + log.Warningf("error sending signal %v to container %q: %v", sig, l.sandboxID, err) + } + }) log.Infof("Process should have started...") l.watchdog.Start() diff --git a/runsc/boot/network.go b/runsc/boot/network.go index f98c5fd36..0240fe323 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -80,7 +80,8 @@ type CreateLinksAndRoutesArgs struct { LoopbackLinks []LoopbackLink FDBasedLinks []FDBasedLink - DefaultGateway DefaultRoute + Defaultv4Gateway DefaultRoute + Defaultv6Gateway DefaultRoute } // Empty returns true if route hasn't been set. @@ -122,10 +123,10 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct nicID++ nicids[link.Name] = nicID - ep := loopback.New() + linkEP := loopback.New() log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses) - if err := n.createNICWithAddrs(nicID, link.Name, ep, link.Addresses, true /* loopback */); err != nil { + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { return err } @@ -157,7 +158,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct } mac := tcpip.LinkAddress(link.LinkAddress) - ep, err := fdbased.New(&fdbased.Options{ + linkEP, err := fdbased.New(&fdbased.Options{ FDs: FDs, MTU: uint32(link.MTU), EthernetHeader: true, @@ -172,7 +173,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct } log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) - if err := n.createNICWithAddrs(nicID, link.Name, ep, link.Addresses, false /* loopback */); err != nil { + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { return err } @@ -186,12 +187,24 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct } } - if !args.DefaultGateway.Route.Empty() { - nicID, ok := nicids[args.DefaultGateway.Name] + if !args.Defaultv4Gateway.Route.Empty() { + nicID, ok := nicids[args.Defaultv4Gateway.Name] if !ok { - return fmt.Errorf("invalid interface name %q for default route", args.DefaultGateway.Name) + return fmt.Errorf("invalid interface name %q for default route", args.Defaultv4Gateway.Name) } - route, err := args.DefaultGateway.Route.toTcpipRoute(nicID) + route, err := args.Defaultv4Gateway.Route.toTcpipRoute(nicID) + if err != nil { + return err + } + routes = append(routes, route) + } + + if !args.Defaultv6Gateway.Route.Empty() { + nicID, ok := nicids[args.Defaultv6Gateway.Name] + if !ok { + return fmt.Errorf("invalid interface name %q for default route", args.Defaultv6Gateway.Name) + } + route, err := args.Defaultv6Gateway.Route.toTcpipRoute(nicID) if err != nil { return err } @@ -205,15 +218,9 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct // createNICWithAddrs creates a NIC in the network stack and adds the given // addresses. -func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP, loopback bool) error { - if loopback { - if err := n.Stack.CreateNamedLoopbackNIC(id, name, sniffer.New(ep)); err != nil { - return fmt.Errorf("CreateNamedLoopbackNIC(%v, %v) failed: %v", id, name, err) - } - } else { - if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(ep)); err != nil { - return fmt.Errorf("CreateNamedNIC(%v, %v) failed: %v", id, name, err) - } +func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP) error { + if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(ep)); err != nil { + return fmt.Errorf("CreateNamedNIC(%v, %v, %v) failed: %v", id, name, ep, err) } // Always start with an arp address for the NIC. diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index ab3a25b9b..653ca5f52 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -101,6 +101,14 @@ func getValue(path, name string) (string, error) { return string(out), nil } +func getInt(path, name string) (int, error) { + s, err := getValue(path, name) + if err != nil { + return 0, err + } + return strconv.Atoi(strings.TrimSpace(s)) +} + // fillFromAncestor sets the value of a cgroup file from the first ancestor // that has content. It does nothing if the file in 'path' has already been set. func fillFromAncestor(path string) (string, error) { @@ -323,6 +331,22 @@ func (c *Cgroup) Join() (func(), error) { return undo, nil } +func (c *Cgroup) CPUQuota() (float64, error) { + path := c.makePath("cpu") + quota, err := getInt(path, "cpu.cfs_quota_us") + if err != nil { + return -1, err + } + period, err := getInt(path, "cpu.cfs_period_us") + if err != nil { + return -1, err + } + if quota <= 0 || period <= 0 { + return -1, err + } + return float64(quota) / float64(period), nil +} + // NumCPU returns the number of CPUs configured in 'cpuset/cpuset.cpus'. func (c *Cgroup) NumCPU() (int, error) { path := c.makePath("cpuset") diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index 7313e473f..f37415810 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -32,16 +32,17 @@ import ( // Debug implements subcommands.Command for the "debug" command. type Debug struct { - pid int - stacks bool - signal int - profileHeap string - profileCPU string - profileDelay int - trace string - strace string - logLevel string - logPackets string + pid int + stacks bool + signal int + profileHeap string + profileCPU string + trace string + strace string + logLevel string + logPackets string + duration time.Duration + ps bool } // Name implements subcommands.Command. @@ -65,12 +66,13 @@ func (d *Debug) SetFlags(f *flag.FlagSet) { f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log") f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.") f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.") - f.IntVar(&d.profileDelay, "profile-delay", 5, "amount of time to wait before stoping CPU profile") + f.DurationVar(&d.duration, "duration", time.Second, "amount of time to wait for CPU and trace profiles") f.StringVar(&d.trace, "trace", "", "writes an execution trace to the given file.") f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox") f.StringVar(&d.strace, "strace", "", `A comma separated list of syscalls to trace. "all" enables all traces, "off" disables all`) f.StringVar(&d.logLevel, "log-level", "", "The log level to set: warning (0), info (1), or debug (2).") f.StringVar(&d.logPackets, "log-packets", "", "A boolean value to enable or disable packet logging: true or false.") + f.BoolVar(&d.ps, "ps", false, "lists processes") } // Execute implements subcommands.Command.Execute. @@ -163,7 +165,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if err := c.Sandbox.StartCPUProfile(f); err != nil { return Errorf(err.Error()) } - log.Infof("CPU profile started for %d sec, writing to %q", d.profileDelay, d.profileCPU) + log.Infof("CPU profile started for %v, writing to %q", d.duration, d.profileCPU) } if d.trace != "" { delay = true @@ -181,8 +183,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if err := c.Sandbox.StartTrace(f); err != nil { return Errorf(err.Error()) } - log.Infof("Tracing started for %d sec, writing to %q", d.profileDelay, d.trace) - + log.Infof("Tracing started for %v, writing to %q", d.duration, d.trace) } if d.strace != "" || len(d.logLevel) != 0 || len(d.logPackets) != 0 { @@ -241,9 +242,20 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } log.Infof("Logging options changed") } + if d.ps { + pList, err := c.Processes() + if err != nil { + Fatalf("getting processes for container: %v", err) + } + o, err := control.ProcessListToJSON(pList) + if err != nil { + Fatalf("generating JSON: %v", err) + } + log.Infof(o) + } if delay { - time.Sleep(time.Duration(d.profileDelay) * time.Second) + time.Sleep(d.duration) } return subcommands.ExitSuccess diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go index 7d67c3a75..5ed131a7f 100644 --- a/runsc/container/console_test.go +++ b/runsc/container/console_test.go @@ -28,6 +28,7 @@ import ( "github.com/kr/pty" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/testutil" @@ -219,9 +220,9 @@ func TestJobControlSignalExec(t *testing.T) { // Make sure all the processes are running. expectedPL := []*control.Process{ // Root container process. - {PID: 1, Cmd: "sleep"}, + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, // Bash from exec process. - {PID: 2, Cmd: "bash"}, + {PID: 2, Cmd: "bash", Threads: []kernel.ThreadID{2}}, } if err := waitForProcessList(c, expectedPL); err != nil { t.Error(err) @@ -231,7 +232,7 @@ func TestJobControlSignalExec(t *testing.T) { ptyMaster.Write([]byte("sleep 100\n")) // Wait for it to start. Sleep's PPID is bash's PID. - expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep"}) + expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}}) if err := waitForProcessList(c, expectedPL); err != nil { t.Error(err) } @@ -361,7 +362,7 @@ func TestJobControlSignalRootContainer(t *testing.T) { // Wait for bash to start. expectedPL := []*control.Process{ - {PID: 1, Cmd: "bash"}, + {PID: 1, Cmd: "bash", Threads: []kernel.ThreadID{1}}, } if err := waitForProcessList(c, expectedPL); err != nil { t.Fatal(err) @@ -371,7 +372,7 @@ func TestJobControlSignalRootContainer(t *testing.T) { ptyMaster.Write([]byte("sleep 100\n")) // Wait for sleep to start. - expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep"}) + expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{2}}) if err := waitForProcessList(c, expectedPL); err != nil { t.Fatal(err) } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 07eacaac0..c10f85992 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -37,6 +37,7 @@ import ( "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/boot/platforms" @@ -52,8 +53,9 @@ func waitForProcessList(cont *Container, want []*control.Process) error { err = fmt.Errorf("error getting process data from container: %v", err) return &backoff.PermanentError{Err: err} } - if !procListsEqual(got, want) { - return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want)) + if r, err := procListsEqual(got, want); !r { + return fmt.Errorf("container got process list: %s, want: %s: error: %v", + procListToString(got), procListToString(want), err) } return nil } @@ -91,22 +93,34 @@ func blockUntilWaitable(pid int) error { // procListsEqual is used to check whether 2 Process lists are equal for all // implemented fields. -func procListsEqual(got, want []*control.Process) bool { +func procListsEqual(got, want []*control.Process) (bool, error) { if len(got) != len(want) { - return false + return false, nil } for i := range got { pd1 := got[i] pd2 := want[i] - // Zero out unimplemented and timing dependant fields. + // Zero out timing dependant fields. pd1.Time = "" pd1.STime = "" pd1.C = 0 - if *pd1 != *pd2 { - return false + // Ignore TTY field too, since it's not relevant in the cases + // where we use this method. Tests that care about the TTY + // field should check for it themselves. + pd1.TTY = "" + pd1Json, err := control.ProcessListToJSON([]*control.Process{pd1}) + if err != nil { + return false, err + } + pd2Json, err := control.ProcessListToJSON([]*control.Process{pd2}) + if err != nil { + return false, err + } + if pd1Json != pd2Json { + return false, nil } } - return true + return true, nil } // getAndCheckProcLists is similar to waitForProcessList, but does not wait and retry the @@ -116,7 +130,11 @@ func getAndCheckProcLists(cont *Container, want []*control.Process) error { if err != nil { return fmt.Errorf("error getting process data from container: %v", err) } - if procListsEqual(got, want) { + equal, err := procListsEqual(got, want) + if err != nil { + return err + } + if equal { return nil } return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want)) @@ -288,11 +306,12 @@ func TestLifecycle(t *testing.T) { // expectedPL lists the expected process state of the container. expectedPL := []*control.Process{ { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + Threads: []kernel.ThreadID{1}, }, } // Create the container. @@ -590,18 +609,20 @@ func TestExec(t *testing.T) { // expectedPL lists the expected process state of the container. expectedPL := []*control.Process{ { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + Threads: []kernel.ThreadID{1}, }, { - UID: uid, - PID: 2, - PPID: 0, - C: 0, - Cmd: "sleep", + UID: uid, + PID: 2, + PPID: 0, + C: 0, + Cmd: "sleep", + Threads: []kernel.ThreadID{2}, }, } @@ -1062,18 +1083,20 @@ func TestPauseResume(t *testing.T) { // expectedPL lists the expected process state of the container. expectedPL := []*control.Process{ { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + Threads: []kernel.ThreadID{1}, }, { - UID: uid, - PID: 2, - PPID: 0, - C: 0, - Cmd: "bash", + UID: uid, + PID: 2, + PPID: 0, + C: 0, + Cmd: "bash", + Threads: []kernel.ThreadID{2}, }, } @@ -1126,11 +1149,12 @@ func TestPauseResume(t *testing.T) { expectedPL2 := []*control.Process{ { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + Threads: []kernel.ThreadID{1}, }, } @@ -1241,18 +1265,20 @@ func TestCapabilities(t *testing.T) { // expectedPL lists the expected process state of the container. expectedPL := []*control.Process{ { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + Threads: []kernel.ThreadID{1}, }, { - UID: uid, - PID: 2, - PPID: 0, - C: 0, - Cmd: "exe", + UID: uid, + PID: 2, + PPID: 0, + C: 0, + Cmd: "exe", + Threads: []kernel.ThreadID{2}, }, } if err := waitForProcessList(cont, expectedPL[:1]); err != nil { @@ -2112,6 +2138,95 @@ func TestOverlayfsStaleRead(t *testing.T) { } } +// TestTTYField checks TTY field returned by container.Processes(). +func TestTTYField(t *testing.T) { + stop := testutil.StartReaper() + defer stop() + + testApp, err := testutil.FindFile("runsc/container/test_app/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } + + testCases := []struct { + name string + useTTY bool + wantTTYField string + }{ + { + name: "no tty", + useTTY: false, + wantTTYField: "?", + }, + { + name: "tty used", + useTTY: true, + wantTTYField: "pts/0", + }, + } + + for _, test := range testCases { + t.Run(test.name, func(t *testing.T) { + conf := testutil.TestConfig() + + // We will run /bin/sleep, possibly with an open TTY. + cmd := []string{"/bin/sleep", "10000"} + if test.useTTY { + // Run inside the "pty-runner". + cmd = append([]string{testApp, "pty-runner"}, cmd...) + } + + spec := testutil.NewSpecWithArgs(cmd...) + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. + args := Args{ + ID: testutil.UniqueContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Wait for sleep to be running, and check the TTY + // field. + var gotTTYField string + cb := func() error { + ps, err := c.Processes() + if err != nil { + err = fmt.Errorf("error getting process data from container: %v", err) + return &backoff.PermanentError{Err: err} + } + for _, p := range ps { + if strings.Contains(p.Cmd, "sleep") { + gotTTYField = p.TTY + return nil + } + } + return fmt.Errorf("sleep not running") + } + if err := testutil.Poll(cb, 30*time.Second); err != nil { + t.Fatalf("error waiting for sleep process: %v", err) + } + + if gotTTYField != test.wantTTYField { + t.Errorf("tty field got %q, want %q", gotTTYField, test.wantTTYField) + } + }) + } +} + // executeSync synchronously executes a new process. func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) { pid, err := cont.Execute(args) diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index a5a62378c..4ad09ceab 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -123,11 +123,11 @@ func execMany(execs []execDesc) error { func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) { for _, spec := range pod { - spec.Annotations[path.Join(boot.MountPrefix, name, "source")] = mount.Source - spec.Annotations[path.Join(boot.MountPrefix, name, "type")] = mount.Type - spec.Annotations[path.Join(boot.MountPrefix, name, "share")] = "pod" + spec.Annotations[boot.MountPrefix+name+".source"] = mount.Source + spec.Annotations[boot.MountPrefix+name+".type"] = mount.Type + spec.Annotations[boot.MountPrefix+name+".share"] = "pod" if len(mount.Options) > 0 { - spec.Annotations[path.Join(boot.MountPrefix, name, "options")] = strings.Join(mount.Options, ",") + spec.Annotations[boot.MountPrefix+name+".options"] = strings.Join(mount.Options, ",") } } } @@ -156,13 +156,13 @@ func TestMultiContainerSanity(t *testing.T) { // Check via ps that multiple processes are running. expectedPL := []*control.Process{ - {PID: 1, Cmd: "sleep"}, + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, } if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) } expectedPL = []*control.Process{ - {PID: 2, Cmd: "sleep"}, + {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}}, } if err := waitForProcessList(containers[1], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) @@ -202,13 +202,13 @@ func TestMultiPIDNS(t *testing.T) { // Check via ps that multiple processes are running. expectedPL := []*control.Process{ - {PID: 1, Cmd: "sleep"}, + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, } if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) } expectedPL = []*control.Process{ - {PID: 1, Cmd: "sleep"}, + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, } if err := waitForProcessList(containers[1], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) @@ -264,7 +264,7 @@ func TestMultiPIDNSPath(t *testing.T) { // Check via ps that multiple processes are running. expectedPL := []*control.Process{ - {PID: 1, Cmd: "sleep"}, + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, } if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) @@ -274,7 +274,7 @@ func TestMultiPIDNSPath(t *testing.T) { } expectedPL = []*control.Process{ - {PID: 2, Cmd: "sleep"}, + {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}}, } if err := waitForProcessList(containers[1], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) @@ -306,7 +306,7 @@ func TestMultiContainerWait(t *testing.T) { // Check via ps that multiple processes are running. expectedPL := []*control.Process{ - {PID: 2, Cmd: "sleep"}, + {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}}, } if err := waitForProcessList(containers[1], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) @@ -351,7 +351,7 @@ func TestMultiContainerWait(t *testing.T) { // After Wait returns, ensure that the root container is running and // the child has finished. expectedPL = []*control.Process{ - {PID: 1, Cmd: "sleep"}, + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, } if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err) @@ -383,7 +383,7 @@ func TestExecWait(t *testing.T) { // Check via ps that process is running. expectedPL := []*control.Process{ - {PID: 2, Cmd: "sleep"}, + {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}}, } if err := waitForProcessList(containers[1], expectedPL); err != nil { t.Fatalf("failed to wait for sleep to start: %v", err) @@ -418,7 +418,7 @@ func TestExecWait(t *testing.T) { // Wait for the exec'd process to exit. expectedPL = []*control.Process{ - {PID: 1, Cmd: "sleep"}, + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, } if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Fatalf("failed to wait for second container to stop: %v", err) @@ -505,7 +505,7 @@ func TestMultiContainerSignal(t *testing.T) { // Check via ps that container 1 process is running. expectedPL := []*control.Process{ - {PID: 2, Cmd: "sleep"}, + {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}}, } if err := waitForProcessList(containers[1], expectedPL); err != nil { @@ -519,7 +519,7 @@ func TestMultiContainerSignal(t *testing.T) { // Make sure process 1 is still running. expectedPL = []*control.Process{ - {PID: 1, Cmd: "sleep"}, + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, } if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) @@ -633,9 +633,10 @@ func TestMultiContainerDestroy(t *testing.T) { if err != nil { t.Fatalf("error getting process data from sandbox: %v", err) } - expectedPL := []*control.Process{{PID: 1, Cmd: "sleep"}} - if !procListsEqual(pss, expectedPL) { - t.Errorf("container got process list: %s, want: %s", procListToString(pss), procListToString(expectedPL)) + expectedPL := []*control.Process{{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}} + if r, err := procListsEqual(pss, expectedPL); !r { + t.Errorf("container got process list: %s, want: %s: error: %v", + procListToString(pss), procListToString(expectedPL), err) } // Check that cont.Destroy is safe to call multiple times. @@ -669,7 +670,7 @@ func TestMultiContainerProcesses(t *testing.T) { // Check root's container process list doesn't include other containers. expectedPL0 := []*control.Process{ - {PID: 1, Cmd: "sleep"}, + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, } if err := waitForProcessList(containers[0], expectedPL0); err != nil { t.Errorf("failed to wait for process to start: %v", err) @@ -677,8 +678,8 @@ func TestMultiContainerProcesses(t *testing.T) { // Same for the other container. expectedPL1 := []*control.Process{ - {PID: 2, Cmd: "sh"}, - {PID: 3, PPID: 2, Cmd: "sleep"}, + {PID: 2, Cmd: "sh", Threads: []kernel.ThreadID{2}}, + {PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}}, } if err := waitForProcessList(containers[1], expectedPL1); err != nil { t.Errorf("failed to wait for process to start: %v", err) @@ -692,7 +693,7 @@ func TestMultiContainerProcesses(t *testing.T) { if _, err := containers[1].Execute(args); err != nil { t.Fatalf("error exec'ing: %v", err) } - expectedPL1 = append(expectedPL1, &control.Process{PID: 4, Cmd: "sleep"}) + expectedPL1 = append(expectedPL1, &control.Process{PID: 4, Cmd: "sleep", Threads: []kernel.ThreadID{4}}) if err := waitForProcessList(containers[1], expectedPL1); err != nil { t.Errorf("failed to wait for process to start: %v", err) } @@ -1513,7 +1514,7 @@ func TestMultiContainerGoferKilled(t *testing.T) { // Ensure container is running c := containers[2] expectedPL := []*control.Process{ - {PID: 3, Cmd: "sleep"}, + {PID: 3, Cmd: "sleep", Threads: []kernel.ThreadID{3}}, } if err := waitForProcessList(c, expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) @@ -1541,7 +1542,7 @@ func TestMultiContainerGoferKilled(t *testing.T) { continue // container[2] has been killed. } pl := []*control.Process{ - {PID: kernel.ThreadID(i + 1), Cmd: "sleep"}, + {PID: kernel.ThreadID(i + 1), Cmd: "sleep", Threads: []kernel.ThreadID{kernel.ThreadID(i + 1)}}, } if err := waitForProcessList(c, pl); err != nil { t.Errorf("Container %q was affected by another container: %v", c.ID, err) @@ -1561,7 +1562,7 @@ func TestMultiContainerGoferKilled(t *testing.T) { // Wait until sandbox stops. waitForProcessList will loop until sandbox exits // and RPC errors out. impossiblePL := []*control.Process{ - {PID: 100, Cmd: "non-existent-process"}, + {PID: 100, Cmd: "non-existent-process", Threads: []kernel.ThreadID{100}}, } if err := waitForProcessList(c, impossiblePL); err == nil { t.Fatalf("Sandbox was not killed after gofer death") diff --git a/runsc/container/test_app/BUILD b/runsc/container/test_app/BUILD index 9bf9e6e9d..bfd338bb6 100644 --- a/runsc/container/test_app/BUILD +++ b/runsc/container/test_app/BUILD @@ -15,5 +15,6 @@ go_binary( "//pkg/unet", "//runsc/testutil", "@com_github_google_subcommands//:go_default_library", + "@com_github_kr_pty//:go_default_library", ], ) diff --git a/runsc/container/test_app/test_app.go b/runsc/container/test_app/test_app.go index 913d781c6..a1c8a741a 100644 --- a/runsc/container/test_app/test_app.go +++ b/runsc/container/test_app/test_app.go @@ -19,6 +19,7 @@ package main import ( "context" "fmt" + "io" "io/ioutil" "log" "net" @@ -31,6 +32,7 @@ import ( "flag" "github.com/google/subcommands" + "github.com/kr/pty" "gvisor.dev/gvisor/runsc/testutil" ) @@ -41,6 +43,7 @@ func main() { subcommands.Register(new(fdReceiver), "") subcommands.Register(new(fdSender), "") subcommands.Register(new(forkBomb), "") + subcommands.Register(new(ptyRunner), "") subcommands.Register(new(reaper), "") subcommands.Register(new(syscall), "") subcommands.Register(new(taskTree), "") @@ -352,3 +355,40 @@ func (c *capability) Execute(ctx context.Context, f *flag.FlagSet, args ...inter return subcommands.ExitSuccess } + +type ptyRunner struct{} + +// Name implements subcommands.Command. +func (*ptyRunner) Name() string { + return "pty-runner" +} + +// Synopsis implements subcommands.Command. +func (*ptyRunner) Synopsis() string { + return "runs the given command with an open pty terminal" +} + +// Usage implements subcommands.Command. +func (*ptyRunner) Usage() string { + return "pty-runner [command]" +} + +// SetFlags implements subcommands.Command.SetFlags. +func (*ptyRunner) SetFlags(f *flag.FlagSet) {} + +// Execute implements subcommands.Command. +func (*ptyRunner) Execute(_ context.Context, fs *flag.FlagSet, _ ...interface{}) subcommands.ExitStatus { + c := exec.Command(fs.Args()[0], fs.Args()[1:]...) + f, err := pty.Start(c) + if err != nil { + fmt.Printf("pty.Start failed: %v", err) + return subcommands.ExitFailure + } + defer f.Close() + + // Copy stdout from the command to keep this process alive until the + // subprocess exits. + io.Copy(os.Stdout, f) + + return subcommands.ExitSuccess +} diff --git a/runsc/dockerutil/dockerutil.go b/runsc/dockerutil/dockerutil.go index 57f6ae8de..9b6346ca2 100644 --- a/runsc/dockerutil/dockerutil.go +++ b/runsc/dockerutil/dockerutil.go @@ -380,6 +380,16 @@ func (d *Docker) FindPort(sandboxPort int) (int, error) { return port, nil } +// FindIP returns the IP address of the container as a string. +func (d *Docker) FindIP() (string, error) { + const format = `{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}` + out, err := do("inspect", "-f", format, d.Name) + if err != nil { + return "", fmt.Errorf("error retrieving IP: %v", err) + } + return strings.TrimSpace(out), nil +} + // SandboxPid returns the PID to the sandbox process. func (d *Docker) SandboxPid() (int, error) { out, err := do("inspect", "-f={{.State.Pid}}", d.Name) diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 9117d9616..b59e1a70e 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -199,6 +199,7 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID { // The reason that the file is not opened initially as read-write is for better // performance with 'overlay2' storage driver. overlay2 eagerly copies the // entire file up when it's opened in write mode, and would perform badly when +// multiple files are only being opened for read (esp. startup). type localFile struct { p9.DefaultWalkGetAttr @@ -956,14 +957,14 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { } func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64) ([]p9.Dirent, error) { + var dirents []p9.Dirent + // Limit 'count' to cap the slice size that is returned. const maxCount = 100000 if count > maxCount { count = maxCount } - dirents := make([]p9.Dirent, 0, count) - // Pre-allocate buffers that will be reused to get partial results. direntsBuf := make([]byte, 8192) names := make([]string, 0, 100) diff --git a/runsc/main.go b/runsc/main.go index 4682b308c..abf929511 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -82,6 +82,7 @@ var ( numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.") rootless = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.") referenceLeakMode = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.") + cpuNumFromQuota = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)") // Test flags, not to be used outside tests, ever. testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") @@ -225,6 +226,7 @@ func main() { AlsoLogToStderr: *alsoLogToStderr, ReferenceLeakMode: refsLeakMode, OverlayfsStaleRead: *overlayfsStaleRead, + CPUNumFromQuota: *cpuNumFromQuota, TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot, TestOnlyTestNameEnv: *testOnlyTestNameEnv, diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index 27459e6d1..8001949d5 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -19,6 +19,7 @@ go_library( "//pkg/log", "//pkg/sentry/control", "//pkg/sentry/platform", + "//pkg/tcpip/header", "//pkg/tcpip/stack", "//pkg/urpc", "//runsc/boot", diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index d42de0176..be8b72b3e 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -28,6 +28,7 @@ import ( "github.com/vishvananda/netlink" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/boot" @@ -183,36 +184,39 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG continue } - // Keep only IPv4 addresses. - var ip4addrs []*net.IPNet + var ipAddrs []*net.IPNet for _, ifaddr := range allAddrs { ipNet, ok := ifaddr.(*net.IPNet) if !ok { return fmt.Errorf("address is not IPNet: %+v", ifaddr) } - if ipNet.IP.To4() == nil { - log.Warningf("IPv6 is not supported, skipping: %v", ipNet) - continue - } - ip4addrs = append(ip4addrs, ipNet) + ipAddrs = append(ipAddrs, ipNet) } - if len(ip4addrs) == 0 { - log.Warningf("No IPv4 address found for interface %q, skipping", iface.Name) + if len(ipAddrs) == 0 { + log.Warningf("No usable IP addresses found for interface %q, skipping", iface.Name) continue } // Scrape the routes before removing the address, since that // will remove the routes as well. - routes, def, err := routesForIface(iface) + routes, defv4, defv6, err := routesForIface(iface) if err != nil { return fmt.Errorf("getting routes for interface %q: %v", iface.Name, err) } - if def != nil { - if !args.DefaultGateway.Route.Empty() { - return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, def, args.DefaultGateway) + if defv4 != nil { + if !args.Defaultv4Gateway.Route.Empty() { + return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv4, args.Defaultv4Gateway) } - args.DefaultGateway.Route = *def - args.DefaultGateway.Name = iface.Name + args.Defaultv4Gateway.Route = *defv4 + args.Defaultv4Gateway.Name = iface.Name + } + + if defv6 != nil { + if !args.Defaultv6Gateway.Route.Empty() { + return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv6, args.Defaultv6Gateway) + } + args.Defaultv6Gateway.Route = *defv6 + args.Defaultv6Gateway.Name = iface.Name } link := boot.FDBasedLink{ @@ -247,6 +251,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG } args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile) } + if link.GSOMaxSize == 0 && softwareGSO { // Hardware GSO is disabled. Let's enable software GSO. link.GSOMaxSize = stack.SoftwareGSOMaxSize @@ -255,7 +260,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG // Collect the addresses for the interface, enable forwarding, // and remove them from the host. - for _, addr := range ip4addrs { + for _, addr := range ipAddrs { link.Addresses = append(link.Addresses, addr.IP) // Steal IP address from NIC. @@ -351,46 +356,56 @@ func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink, } // routesForIface iterates over all routes for the given interface and converts -// them to boot.Routes. -func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) { +// them to boot.Routes. It also returns the a default v4/v6 route if found. +func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, *boot.Route, error) { link, err := netlink.LinkByIndex(iface.Index) if err != nil { - return nil, nil, err + return nil, nil, nil, err } rs, err := netlink.RouteList(link, netlink.FAMILY_ALL) if err != nil { - return nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err) + return nil, nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err) } - var def *boot.Route + var defv4, defv6 *boot.Route var routes []boot.Route for _, r := range rs { // Is it a default route? if r.Dst == nil { if r.Gw == nil { - return nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r) - } - if r.Gw.To4() == nil { - log.Warningf("IPv6 is not supported, skipping default route: %v", r) - continue - } - if def != nil { - return nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, def, r) + return nil, nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r) } // Create a catch all route to the gateway. - def = &boot.Route{ - Destination: net.IPNet{ - IP: net.IPv4zero, - Mask: net.IPMask(net.IPv4zero), - }, - Gateway: r.Gw, + switch len(r.Gw) { + case header.IPv4AddressSize: + if defv4 != nil { + return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv4, r) + } + defv4 = &boot.Route{ + Destination: net.IPNet{ + IP: net.IPv4zero, + Mask: net.IPMask(net.IPv4zero), + }, + Gateway: r.Gw, + } + case header.IPv6AddressSize: + if defv6 != nil { + return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv6, r) + } + + defv6 = &boot.Route{ + Destination: net.IPNet{ + IP: net.IPv6zero, + Mask: net.IPMask(net.IPv6zero), + }, + Gateway: r.Gw, + } + default: + return nil, nil, nil, fmt.Errorf("unexpected address size for gateway: %+v for route: %+v", r.Gw, r) } continue } - if r.Dst.IP.To4() == nil { - log.Warningf("IPv6 is not supported, skipping route: %v", r) - continue - } + dst := *r.Dst dst.IP = dst.IP.Mask(dst.Mask) routes = append(routes, boot.Route{ @@ -398,7 +413,7 @@ func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) { Gateway: r.Gw, }) } - return routes, def, nil + return routes, defv4, defv6, nil } // removeAddress removes IP address from network device. It's equivalent to: diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index ee9327fc8..ce1452b87 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -18,6 +18,7 @@ package sandbox import ( "context" "fmt" + "math" "os" "os/exec" "strconv" @@ -631,6 +632,26 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF if err != nil { return fmt.Errorf("getting cpu count from cgroups: %v", err) } + if conf.CPUNumFromQuota { + // Dropping below 2 CPUs can trigger application to disable + // locks that can lead do hard to debug errors, so just + // leaving two cores as reasonable default. + const minCPUs = 2 + + quota, err := s.Cgroup.CPUQuota() + if err != nil { + return fmt.Errorf("getting cpu qouta from cgroups: %v", err) + } + if n := int(math.Ceil(quota)); n > 0 { + if n < minCPUs { + n = minCPUs + } + if n < cpuNum { + // Only lower the cpu number. + cpuNum = n + } + } + } cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum)) mem, err := s.Cgroup.MemoryLimit() @@ -1004,16 +1025,22 @@ func (s *Sandbox) ChangeLogging(args control.LoggingArgs) error { // DestroyContainer destroys the given container. If it is the root container, // then the entire sandbox is destroyed. func (s *Sandbox) DestroyContainer(cid string) error { + if err := s.destroyContainer(cid); err != nil { + // If the sandbox isn't running, the container has already been destroyed, + // ignore the error in this case. + if s.IsRunning() { + return err + } + } + return nil +} + +func (s *Sandbox) destroyContainer(cid string) error { if s.IsRootContainer(cid) { log.Debugf("Destroying root container %q by destroying sandbox", cid) return s.destroy() } - if !s.IsRunning() { - // Sandbox isn't running anymore, container is already destroyed. - return nil - } - log.Debugf("Destroying container %q in sandbox %q", cid, s.ID) conn, err := s.sandboxConnect() if err != nil { diff --git a/scripts/benchmarks.sh b/scripts/benchmarks.sh new file mode 100755 index 000000000..6b9065b07 --- /dev/null +++ b/scripts/benchmarks.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Copyright 2019 The gVisor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#!/usr/bin/env bash + +if [ "$#" -lt "1" ]; then + echo "usage: $0 <--mock |--env=<filename>> ..." + echo "example: $0 --mock --runs=8" + exit 1 +fi + +source $(dirname $0)/common.sh + +readonly TIMESTAMP=`date "+%Y%m%d-%H%M%S"` +readonly OUTDIR="$(mktemp --tmpdir -d run-${TIMESTAMP}-XXX)" +readonly DEFAULT_RUNTIMES="--runtime=runc --runtime=runsc --runtime=runsc-kvm" +readonly ALL_RUNTIMES="--runtime=runc --runtime=runsc --runtime=runsc-kvm" + +run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} 'fio.(read|write)' --metric=bandwidth --size=5g --ioengine=sync --blocksize=1m > "${OUTDIR}/fio.csv" +run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} fio.rand --metric=bandwidth --size=5g --ioengine=sync --blocksize=4k --time=30 > "${OUTDIR}/tmp_fio.csv" +cat "${OUTDIR}/tmp_fio.csv" | grep "\(runc\|runsc\)" >> "${OUTDIR}/fio.csv" && rm "${OUTDIR}/tmp_fio.csv" + +run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} 'fio.(read|write)' --metric=bandwidth --tmpfs=True --size=5g --ioengine=sync --blocksize=1m > "${OUTDIR}/fio-tmpfs.csv" +run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} fio.rand --metric=bandwidth --tmpfs=True --size=5g --ioengine=sync --blocksize=4k --time=30 > "${OUTDIR}/tmp_fio.csv" +cat "${OUTDIR}/tmp_fio.csv" | grep "\(runc\|runsc\)" >> "${OUTDIR}/fio-tmpfs.csv" && rm "${OUTDIR}/tmp_fio.csv" + +run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} startup --count=50 > "${OUTDIR}/startup.csv" +run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} density > "${OUTDIR}/density.csv" + +run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} sysbench.cpu --threads=1 --max_prime=50000 --options='--max-time=5' > "${OUTDIR}/sysbench-cpu.csv" +run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} sysbench.memory --threads=1 --options='--memory-block-size=1M --memory-total-size=500G' > "${OUTDIR}/sysbench-memory.csv" +run //benchmarks:perf -- run "$@" ${ALL_RUNTIMES} syscall > "${OUTDIR}/syscall.csv" +run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} 'network.(upload|download)' --runs=20 > "${OUTDIR}/iperf.csv" +run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} ml.tensorflow > "${OUTDIR}/tensorflow.csv" +run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} media.ffmpeg > "${OUTDIR}/ffmpeg.csv" +run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} http.httpd --path=latin100k.txt --connections=1 --connections=5 --connections=10 --connections=25 > "${OUTDIR}/httpd100k.csv" +run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} http.httpd --path=latin10240k.txt --connections=1 --connections=5 --connections=10 --connections=25 > "${OUTDIR}/httpd10240k.csv" +run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} redis > "${OUTDIR}/redis.csv" +run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} 'http.(ruby|node)' > "${OUTDIR}/applications.csv" + +echo "${OUTPUT}" && exit 0 diff --git a/scripts/dev.sh b/scripts/dev.sh index c67003018..6238b4d0b 100755 --- a/scripts/dev.sh +++ b/scripts/dev.sh @@ -54,9 +54,10 @@ declare OUTPUT="$(build //runsc)" if [[ ${REFRESH} -eq 0 ]]; then install_runsc "${RUNTIME}" --net-raw install_runsc "${RUNTIME}-d" --net-raw --debug --strace --log-packets + install_runsc "${RUNTIME}-p" --net-raw --profile echo - echo "Runtimes ${RUNTIME} and ${RUNTIME}-d (debug enabled) setup." + echo "Runtimes ${RUNTIME}, ${RUNTIME}-d (debug enabled), and ${RUNTIME}-p installed." echo "Use --runtime="${RUNTIME}" with your Docker command." echo " docker run --rm --runtime="${RUNTIME}" hello-world" echo diff --git a/scripts/go.sh b/scripts/go.sh index 0dbfb7747..626ed8fa4 100755 --- a/scripts/go.sh +++ b/scripts/go.sh @@ -25,6 +25,8 @@ tools/go_branch.sh # Checkout the new branch. git checkout go && git clean -f +go version + # Build everything. go build ./... diff --git a/scripts/iptables_tests.sh b/scripts/iptables_tests.sh new file mode 100755 index 000000000..c47cbd675 --- /dev/null +++ b/scripts/iptables_tests.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +# Copyright 2018 The gVisor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +source $(dirname $0)/common.sh + +install_runsc_for_test iptables + +# Build the docker image for the test. +run //test/iptables/runner --norun + +# TODO(gvisor.dev/issue/170): Also test this on runsc once iptables are better +# supported +test //test/iptables:iptables_test "--test_arg=--runtime=runc" \ + "--test_arg=--image=bazel/test/iptables/runner:runner" diff --git a/scripts/simple_tests.sh b/scripts/simple_tests.sh index 585216aae..3a15050c2 100755 --- a/scripts/simple_tests.sh +++ b/scripts/simple_tests.sh @@ -17,4 +17,4 @@ source $(dirname $0)/common.sh # Run all simple tests (locally). -test //pkg/... //runsc/... //tools/... +test //pkg/... //runsc/... //tools/... //benchmarks/... //benchmarks/runner:runner_test diff --git a/test/iptables/BUILD b/test/iptables/BUILD new file mode 100644 index 000000000..fa833c3b2 --- /dev/null +++ b/test/iptables/BUILD @@ -0,0 +1,31 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "iptables", + srcs = [ + "filter_input.go", + "iptables.go", + "iptables_util.go", + ], + importpath = "gvisor.dev/gvisor/test/iptables", + visibility = ["//test/iptables:__subpackages__"], +) + +go_test( + name = "iptables_test", + srcs = [ + "iptables_test.go", + ], + embed = [":iptables"], + tags = [ + "local", + "manual", + ], + deps = [ + "//pkg/log", + "//runsc/dockerutil", + "//runsc/testutil", + ], +) diff --git a/test/iptables/README.md b/test/iptables/README.md new file mode 100644 index 000000000..9f8e34420 --- /dev/null +++ b/test/iptables/README.md @@ -0,0 +1,44 @@ +# iptables Tests + +iptables tests are run via `scripts/iptables_test.sh`. + +## Test Structure + +Each test implements `TestCase`, providing (1) a function to run inside the +container and (2) a function to run locally. Those processes are given each +others' IP addresses. The test succeeds when both functions succeed. + +The function inside the container (`ContainerAction`) typically sets some +iptables rules and then tries to send or receive packets. The local function +(`LocalAction`) will typically just send or receive packets. + +### Adding Tests + +1) Add your test to the `iptables` package. + +2) Register the test in an `init` function via `RegisterTestCase` (see +`filter_input.go` as an example). + +3) Add it to `iptables_test.go` (see the other tests in that file). + +Your test is now runnable with bazel! + +## Run individual tests + +Build the testing Docker container: + +```bash +$ bazel run //test/iptables/runner -- --norun +``` + +Run an individual test via: + +```bash +$ bazel test //test/iptables:iptables_test --test_filter=<TESTNAME> +``` + +To run an individual test with `runc`: + +```bash +$ bazel test //test/iptables:iptables_test --test_filter=<TESTNAME> --test_arg=--runtime=runc +``` diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go new file mode 100644 index 000000000..923f44e68 --- /dev/null +++ b/test/iptables/filter_input.go @@ -0,0 +1,124 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package iptables + +import ( + "fmt" + "net" + "time" +) + +const ( + dropPort = 2401 + acceptPort = 2402 + sendloopDuration = 2 * time.Second + network = "udp4" +) + +func init() { + RegisterTestCase(FilterInputDropUDP{}) + RegisterTestCase(FilterInputDropUDPPort{}) + RegisterTestCase(FilterInputDropDifferentUDPPort{}) +} + +// FilterInputDropUDP tests that we can drop UDP traffic. +type FilterInputDropUDP struct{} + +// Name implements TestCase.Name. +func (FilterInputDropUDP) Name() string { + return "FilterInputDropUDP" +} + +// ContainerAction implements TestCase.ContainerAction. +func (FilterInputDropUDP) ContainerAction(ip net.IP) error { + if err := filterTable("-A", "INPUT", "-p", "udp", "-j", "DROP"); err != nil { + return err + } + + // Listen for UDP packets on dropPort. + if err := listenUDP(dropPort, sendloopDuration); err == nil { + return fmt.Errorf("packets on port %d should have been dropped, but got a packet", dropPort) + } else if netErr, ok := err.(net.Error); !ok || !netErr.Timeout() { + return fmt.Errorf("error reading: %v", err) + } + + // At this point we know that reading timed out and never received a + // packet. + return nil +} + +// LocalAction implements TestCase.LocalAction. +func (FilterInputDropUDP) LocalAction(ip net.IP) error { + return sendUDPLoop(ip, dropPort, sendloopDuration) +} + +// FilterInputDropUDPPort tests that we can drop UDP traffic by port. +type FilterInputDropUDPPort struct{} + +// Name implements TestCase.Name. +func (FilterInputDropUDPPort) Name() string { + return "FilterInputDropUDPPort" +} + +// ContainerAction implements TestCase.ContainerAction. +func (FilterInputDropUDPPort) ContainerAction(ip net.IP) error { + if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil { + return err + } + + // Listen for UDP packets on dropPort. + if err := listenUDP(dropPort, sendloopDuration); err == nil { + return fmt.Errorf("packets on port %d should have been dropped, but got a packet", dropPort) + } else if netErr, ok := err.(net.Error); !ok || !netErr.Timeout() { + return fmt.Errorf("error reading: %v", err) + } + + // At this point we know that reading timed out and never received a + // packet. + return nil +} + +// LocalAction implements TestCase.LocalAction. +func (FilterInputDropUDPPort) LocalAction(ip net.IP) error { + return sendUDPLoop(ip, dropPort, sendloopDuration) +} + +// FilterInputDropDifferentUDPPort tests that dropping traffic for a single UDP port +// doesn't drop packets on other ports. +type FilterInputDropDifferentUDPPort struct{} + +// Name implements TestCase.Name. +func (FilterInputDropDifferentUDPPort) Name() string { + return "FilterInputDropDifferentUDPPort" +} + +// ContainerAction implements TestCase.ContainerAction. +func (FilterInputDropDifferentUDPPort) ContainerAction(ip net.IP) error { + if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil { + return err + } + + // Listen for UDP packets on another port. + if err := listenUDP(acceptPort, sendloopDuration); err != nil { + return fmt.Errorf("packets on port %d should be allowed, but encountered an error: %v", acceptPort, err) + } + + return nil +} + +// LocalAction implements TestCase.LocalAction. +func (FilterInputDropDifferentUDPPort) LocalAction(ip net.IP) error { + return sendUDPLoop(ip, acceptPort, sendloopDuration) +} diff --git a/test/iptables/iptables.go b/test/iptables/iptables.go new file mode 100644 index 000000000..2e565d988 --- /dev/null +++ b/test/iptables/iptables.go @@ -0,0 +1,53 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package iptables contains a set of iptables tests implemented as TestCases +package iptables + +import ( + "fmt" + "net" +) + +// IPExchangePort is the port the container listens on to receive the IP +// address of the local process. +const IPExchangePort = 2349 + +// A TestCase contains one action to run in the container and one to run +// locally. The actions run concurrently and each must succeed for the test +// pass. +type TestCase interface { + // Name returns the name of the test. + Name() string + + // ContainerAction runs inside the container. It receives the IP of the + // local process. + ContainerAction(ip net.IP) error + + // LocalAction runs locally. It receives the IP of the container. + LocalAction(ip net.IP) error +} + +// Tests maps test names to TestCase. +// +// New TestCases are added by calling RegisterTestCase in an init function. +var Tests = map[string]TestCase{} + +// RegisterTestCase registers tc so it can be run. +func RegisterTestCase(tc TestCase) { + if _, ok := Tests[tc.Name()]; ok { + panic(fmt.Sprintf("TestCase %s already registered.", tc.Name())) + } + Tests[tc.Name()] = tc +} diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go new file mode 100644 index 000000000..bfbf1bb87 --- /dev/null +++ b/test/iptables/iptables_test.go @@ -0,0 +1,179 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package iptables + +import ( + "fmt" + "net" + "os" + "path" + "testing" + "time" + + "flag" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/runsc/dockerutil" + "gvisor.dev/gvisor/runsc/testutil" +) + +const timeout time.Duration = 10 * time.Second + +var image = flag.String("image", "bazel/test/iptables/runner:runner", "image to run tests in") + +type result struct { + output string + err error +} + +// singleTest runs a TestCase. Each test follows a pattern: +// - Create a container. +// - Get the container's IP. +// - Send the container our IP. +// - Start a new goroutine running the local action of the test. +// - Wait for both the container and local actions to finish. +// +// Container output is logged to $TEST_UNDECLARED_OUTPUTS_DIR if it exists, or +// to stderr. +func singleTest(test TestCase) error { + if _, ok := Tests[test.Name()]; !ok { + return fmt.Errorf("no test found with name %q. Has it been registered?", test.Name()) + } + + // Create and start the container. + cont := dockerutil.MakeDocker("gvisor-iptables") + defer cont.CleanUp() + resultChan := make(chan *result) + go func() { + output, err := cont.RunFg("--cap-add=NET_ADMIN", *image, "-name", test.Name()) + logContainer(output, err) + resultChan <- &result{output, err} + }() + + // Get the container IP. + ip, err := getIP(cont) + if err != nil { + return fmt.Errorf("failed to get container IP: %v", err) + } + + // Give the container our IP. + if err := sendIP(ip); err != nil { + return fmt.Errorf("failed to send IP to container: %v", err) + } + + // Run our side of the test. + errChan := make(chan error) + go func() { + errChan <- test.LocalAction(ip) + }() + + // Wait for both the container and local tests to finish. + var res *result + to := time.After(timeout) + for localDone := false; res == nil || !localDone; { + select { + case res = <-resultChan: + log.Infof("Container finished.") + case err, localDone = <-errChan: + log.Infof("Local finished.") + if err != nil { + return fmt.Errorf("local test failed: %v", err) + } + case <-to: + return fmt.Errorf("timed out after %f seconds", timeout.Seconds()) + } + } + + return res.err +} + +func getIP(cont dockerutil.Docker) (net.IP, error) { + // The container might not have started yet, so retry a few times. + var ipStr string + to := time.After(timeout) + for ipStr == "" { + ipStr, _ = cont.FindIP() + select { + case <-to: + return net.IP{}, fmt.Errorf("timed out getting IP after %f seconds", timeout.Seconds()) + default: + time.Sleep(250 * time.Millisecond) + } + } + ip := net.ParseIP(ipStr) + if ip == nil { + return net.IP{}, fmt.Errorf("invalid IP: %q", ipStr) + } + log.Infof("Container has IP of %s", ipStr) + return ip, nil +} + +func sendIP(ip net.IP) error { + contAddr := net.TCPAddr{ + IP: ip, + Port: IPExchangePort, + } + var conn *net.TCPConn + // The container may not be listening when we first connect, so retry + // upon error. + cb := func() error { + c, err := net.DialTCP("tcp4", nil, &contAddr) + conn = c + return err + } + if err := testutil.Poll(cb, timeout); err != nil { + return fmt.Errorf("timed out waiting to send IP, most recent error: %v", err) + } + if _, err := conn.Write([]byte{0}); err != nil { + return fmt.Errorf("error writing to container: %v", err) + } + return nil +} + +func logContainer(output string, err error) { + msg := fmt.Sprintf("Container error: %v\nContainer output:\n%v", err, output) + if artifactsDir := os.Getenv("TEST_UNDECLARED_OUTPUTS_DIR"); artifactsDir != "" { + fpath := path.Join(artifactsDir, "container.log") + if file, err := os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE, 0644); err != nil { + log.Warningf("Failed to open log file %q: %v", fpath, err) + } else { + defer file.Close() + if _, err := file.Write([]byte(msg)); err == nil { + return + } + log.Warningf("Failed to write to log file %s: %v", fpath, err) + } + } + + // We couldn't write to the output directory -- just log to stderr. + log.Infof(msg) +} + +func TestFilterInputDropUDP(t *testing.T) { + if err := singleTest(FilterInputDropUDP{}); err != nil { + t.Fatal(err) + } +} + +func TestFilterInputDropUDPPort(t *testing.T) { + if err := singleTest(FilterInputDropUDPPort{}); err != nil { + t.Fatal(err) + } +} + +func TestFilterInputDropDifferentUDPPort(t *testing.T) { + if err := singleTest(FilterInputDropDifferentUDPPort{}); err != nil { + t.Fatal(err) + } +} diff --git a/test/iptables/iptables_util.go b/test/iptables/iptables_util.go new file mode 100644 index 000000000..3a4d11f1a --- /dev/null +++ b/test/iptables/iptables_util.go @@ -0,0 +1,82 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package iptables + +import ( + "fmt" + "net" + "os/exec" + "time" +) + +const iptablesBinary = "iptables" + +// filterTable calls `iptables -t filter` with the given args. +func filterTable(args ...string) error { + args = append([]string{"-t", "filter"}, args...) + cmd := exec.Command(iptablesBinary, args...) + if out, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("error running iptables with args %v\nerror: %v\noutput: %s", args, err, string(out)) + } + return nil +} + +// listenUDP listens on a UDP port and returns the value of net.Conn.Read() for +// the first read on that port. +func listenUDP(port int, timeout time.Duration) error { + localAddr := net.UDPAddr{ + Port: port, + } + conn, err := net.ListenUDP(network, &localAddr) + if err != nil { + return err + } + defer conn.Close() + conn.SetDeadline(time.Now().Add(timeout)) + _, err = conn.Read([]byte{0}) + return err +} + +// sendUDPLoop sends 1 byte UDP packets repeatedly to the IP and port specified +// over a duration. +func sendUDPLoop(ip net.IP, port int, duration time.Duration) error { + // Send packets for a few seconds. + remote := net.UDPAddr{ + IP: ip, + Port: port, + } + conn, err := net.DialUDP(network, nil, &remote) + if err != nil { + return err + } + defer conn.Close() + + to := time.After(duration) + for timedOut := false; !timedOut; { + // This may return an error (connection refused) if the remote + // hasn't started listening yet or they're dropping our + // packets. So we ignore Write errors and depend on the remote + // to report a failure if it doesn't get a packet it needs. + conn.Write([]byte{0}) + select { + case <-to: + timedOut = true + default: + time.Sleep(200 * time.Millisecond) + } + } + + return nil +} diff --git a/test/iptables/runner/BUILD b/test/iptables/runner/BUILD new file mode 100644 index 000000000..c6c42d870 --- /dev/null +++ b/test/iptables/runner/BUILD @@ -0,0 +1,16 @@ +load("@io_bazel_rules_docker//go:image.bzl", "go_image") +load("@io_bazel_rules_docker//container:container.bzl", "container_image") + +package(licenses = ["notice"]) + +container_image( + name = "iptables-base", + base = "@iptables-test//image", +) + +go_image( + name = "runner", + srcs = ["main.go"], + base = ":iptables-base", + deps = ["//test/iptables"], +) diff --git a/test/iptables/runner/Dockerfile b/test/iptables/runner/Dockerfile new file mode 100644 index 000000000..b77db44a1 --- /dev/null +++ b/test/iptables/runner/Dockerfile @@ -0,0 +1,4 @@ +# This Dockerfile builds the image hosted at +# gcr.io/gvisor-presubmit/iptables-test. +FROM ubuntu +RUN apt update && apt install -y iptables diff --git a/test/iptables/runner/main.go b/test/iptables/runner/main.go new file mode 100644 index 000000000..3c794114e --- /dev/null +++ b/test/iptables/runner/main.go @@ -0,0 +1,70 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package main runs iptables tests from within a docker container. +package main + +import ( + "flag" + "fmt" + "log" + "net" + + "gvisor.dev/gvisor/test/iptables" +) + +var name = flag.String("name", "", "name of the test to run") + +func main() { + flag.Parse() + + // Find out which test we're running. + test, ok := iptables.Tests[*name] + if !ok { + log.Fatalf("No test found named %q", *name) + } + log.Printf("Running test %q", *name) + + // Get the IP of the local process. + ip, err := getIP() + if err != nil { + log.Fatal(err) + } + + // Run the test. + if err := test.ContainerAction(ip); err != nil { + log.Fatalf("Failed running test %q: %v", *name, err) + } +} + +// getIP listens for a connection from the local process and returns the source +// IP of that connection. +func getIP() (net.IP, error) { + localAddr := net.TCPAddr{ + Port: iptables.IPExchangePort, + } + listener, err := net.ListenTCP("tcp4", &localAddr) + if err != nil { + return net.IP{}, fmt.Errorf("failed listening for IP: %v", err) + } + defer listener.Close() + conn, err := listener.AcceptTCP() + if err != nil { + return net.IP{}, fmt.Errorf("failed accepting IP: %v", err) + } + defer conn.Close() + log.Printf("Connected to %v", conn.RemoteAddr()) + + return conn.RemoteAddr().(*net.TCPAddr).IP, nil +} diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 722d14b53..a3a85917d 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -376,6 +376,8 @@ syscall_test( syscall_test(test = "//test/syscalls/linux:rlimits_test") +syscall_test(test = "//test/syscalls/linux:rseq_test") + syscall_test(test = "//test/syscalls/linux:rtsignal_test") syscall_test(test = "//test/syscalls/linux:sched_test") @@ -669,6 +671,7 @@ syscall_test(test = "//test/syscalls/linux:udp_bind_test") syscall_test( size = "medium", + add_hostinet = True, shard_count = 10, test = "//test/syscalls/linux:udp_socket_test", ) @@ -714,6 +717,11 @@ syscall_test(test = "//test/syscalls/linux:proc_net_tcp_test") syscall_test(test = "//test/syscalls/linux:proc_net_udp_test") +syscall_test( + add_overlay = True, + test = "//test/syscalls/linux:xattr_test", +) + go_binary( name = "syscall_test_runner", testonly = 1, diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl index dcf5b73ed..aaf77c65b 100644 --- a/test/syscalls/build_defs.bzl +++ b/test/syscalls/build_defs.bzl @@ -9,6 +9,7 @@ def syscall_test( use_tmpfs = False, add_overlay = False, add_uds_tree = False, + add_hostinet = False, tags = None): _syscall_test( test = test, @@ -65,6 +66,18 @@ def syscall_test( file_access = "shared", ) + if add_hostinet: + _syscall_test( + test = test, + shard_count = shard_count, + size = size, + platform = "ptrace", + use_tmpfs = use_tmpfs, + network = "host", + add_uds_tree = add_uds_tree, + tags = tags, + ) + def _syscall_test( test, shard_count, @@ -72,6 +85,7 @@ def _syscall_test( platform, use_tmpfs, tags, + network = "none", file_access = "exclusive", overlay = False, add_uds_tree = False): @@ -85,6 +99,8 @@ def _syscall_test( name += "_shared" if overlay: name += "_overlay" + if network != "none": + name += "_" + network + "net" if tags == None: tags = [] @@ -107,6 +123,7 @@ def _syscall_test( # Arguments are passed directly to syscall_test_runner binary. "--test-name=" + test_name, "--platform=" + platform, + "--network=" + network, "--use-tmpfs=" + str(use_tmpfs), "--file-access=" + file_access, "--overlay=" + str(overlay), diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 6345ea28c..064ce8429 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -6,6 +6,19 @@ package( licenses = ["notice"], ) +exports_files( + [ + "socket.cc", + "socket_inet_loopback.cc", + "socket_ip_loopback_blocking.cc", + "socket_ip_tcp_loopback.cc", + "socket_ipv4_udp_unbound_loopback.cc", + "tcp_socket.cc", + "udp_socket.cc", + ], + visibility = ["//:sandbox"], +) + cc_binary( name = "sigaltstack_check", testonly = 1, @@ -743,6 +756,7 @@ cc_binary( "//test/util:eventfd_util", "//test/util:multiprocess_util", "//test/util:posix_error", + "//test/util:save_util", "//test/util:temp_path", "//test/util:test_util", "//test/util:timer_util", @@ -1795,7 +1809,6 @@ cc_binary( name = "readv_socket_test", testonly = 1, srcs = [ - "file_base.h", "readv_common.cc", "readv_common.h", "readv_socket.cc", @@ -1843,6 +1856,22 @@ cc_binary( ) cc_binary( + name = "rseq_test", + testonly = 1, + srcs = ["rseq.cc"], + data = ["//test/syscalls/linux/rseq"], + linkstatic = 1, + deps = [ + "//test/syscalls/linux/rseq:lib", + "//test/util:logging", + "//test/util:multiprocess_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( name = "rtsignal_test", testonly = 1, srcs = ["rtsignal.cc"], @@ -2142,6 +2171,7 @@ cc_library( ":socket_test_util", "//test/util:test_util", "//test/util:thread_util", + "@com_google_absl//absl/time", "@com_google_googletest//:gtest", ], alwayslink = 1, @@ -3245,8 +3275,6 @@ cc_binary( testonly = 1, srcs = ["tcp_socket.cc"], linkstatic = 1, - # FIXME(b/135470853) - tags = ["flaky"], deps = [ ":socket_test_util", "//test/util:file_descriptor", @@ -3351,11 +3379,15 @@ cc_binary( ], ) -cc_binary( - name = "udp_socket_test", +cc_library( + name = "udp_socket_test_cases", testonly = 1, - srcs = ["udp_socket.cc"], - linkstatic = 1, + srcs = [ + "udp_socket_test_cases.cc", + ] + select_for_linux([ + "udp_socket_errqueue_test_case.cc", + ]), + hdrs = ["udp_socket_test_cases.h"], deps = [ ":socket_test_util", ":unix_domain_socket_test_util", @@ -3366,6 +3398,17 @@ cc_binary( "@com_google_absl//absl/time", "@com_google_googletest//:gtest", ], + alwayslink = 1, +) + +cc_binary( + name = "udp_socket_test", + testonly = 1, + srcs = ["udp_socket.cc"], + linkstatic = 1, + deps = [ + ":udp_socket_test_cases", + ], ) cc_binary( @@ -3682,3 +3725,24 @@ cc_binary( "@com_google_googletest//:gtest", ], ) + +cc_binary( + name = "xattr_test", + testonly = 1, + srcs = [ + "file_base.h", + "xattr.cc", + ], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) diff --git a/test/syscalls/linux/aio.cc b/test/syscalls/linux/aio.cc index b27d4e10a..a33daff17 100644 --- a/test/syscalls/linux/aio.cc +++ b/test/syscalls/linux/aio.cc @@ -129,7 +129,7 @@ TEST_F(AIOTest, BasicWrite) { // aio implementation uses aio_ring. gVisor doesn't and returns all zeroes. // Linux implements aio_ring, so skip the zeroes check. // - // TODO(b/65486370): Remove when gVisor implements aio_ring. + // TODO(gvisor.dev/issue/204): Remove when gVisor implements aio_ring. auto ring = reinterpret_cast<struct aio_ring*>(ctx_); auto magic = IsRunningOnGvisor() ? 0 : AIO_RING_MAGIC; EXPECT_EQ(ring->magic, magic); diff --git a/test/syscalls/linux/clock_gettime.cc b/test/syscalls/linux/clock_gettime.cc index 2aa91691e..7f6015049 100644 --- a/test/syscalls/linux/clock_gettime.cc +++ b/test/syscalls/linux/clock_gettime.cc @@ -56,11 +56,6 @@ void spin_ns(int64_t ns) { // Test that CLOCK_PROCESS_CPUTIME_ID is a superset of CLOCK_THREAD_CPUTIME_ID. TEST(ClockGettime, CputimeId) { - // TODO(b/128871825,golang.org/issue/10958): Test times out when there is a - // small number of core because one goroutine starves the others. - printf("CPUS: %d\n", std::thread::hardware_concurrency()); - SKIP_IF(std::thread::hardware_concurrency() <= 2); - constexpr int kNumThreads = 13; // arbitrary absl::Duration spin_time = absl::Seconds(1); diff --git a/test/syscalls/linux/exceptions.cc b/test/syscalls/linux/exceptions.cc index 370e85166..3d564e720 100644 --- a/test/syscalls/linux/exceptions.cc +++ b/test/syscalls/linux/exceptions.cc @@ -22,6 +22,23 @@ namespace gvisor { namespace testing { +// Default value for the x87 FPU control word. See Intel SDM Vol 1, Ch 8.1.5 +// "x87 FPU Control Word". +constexpr uint16_t kX87ControlWordDefault = 0x37f; + +// Mask for the divide-by-zero exception. +constexpr uint16_t kX87ControlWordDiv0Mask = 1 << 2; + +// Default value for the SSE control register (MXCSR). See Intel SDM Vol 1, Ch +// 11.6.4 "Initialization of SSE/SSE3 Extensions". +constexpr uint32_t kMXCSRDefault = 0x1f80; + +// Mask for the divide-by-zero exception. +constexpr uint32_t kMXCSRDiv0Mask = 1 << 9; + +// Flag for a pending divide-by-zero exception. +constexpr uint32_t kMXCSRDiv0Flag = 1 << 2; + void inline Halt() { asm("hlt\r\n"); } void inline SetAlignmentCheck() { @@ -107,6 +124,170 @@ TEST(ExceptionTest, DivideByZero) { ::testing::KilledBySignal(SIGFPE), ""); } +// By default, x87 exceptions are masked and simply return a default value. +TEST(ExceptionTest, X87DivideByZeroMasked) { + int32_t quotient; + int32_t value = 1; + int32_t divisor = 0; + asm("fildl %[value]\r\n" + "fidivl %[divisor]\r\n" + "fistpl %[quotient]\r\n" + : [ quotient ] "=m"(quotient) + : [ value ] "m"(value), [ divisor ] "m"(divisor)); + + EXPECT_EQ(quotient, INT32_MIN); +} + +// When unmasked, division by zero raises SIGFPE. +TEST(ExceptionTest, X87DivideByZeroUnmasked) { + // See above. + struct sigaction sa = {}; + sa.sa_handler = SIG_DFL; + auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGFPE, sa)); + + EXPECT_EXIT( + { + // Clear the divide by zero exception mask. + constexpr uint16_t kControlWord = + kX87ControlWordDefault & ~kX87ControlWordDiv0Mask; + + int32_t quotient; + int32_t value = 1; + int32_t divisor = 0; + asm volatile( + "fldcw %[cw]\r\n" + "fildl %[value]\r\n" + "fidivl %[divisor]\r\n" + "fistpl %[quotient]\r\n" + : [ quotient ] "=m"(quotient) + : [ cw ] "m"(kControlWord), [ value ] "m"(value), + [ divisor ] "m"(divisor)); + }, + ::testing::KilledBySignal(SIGFPE), ""); +} + +// Pending exceptions in the x87 status register are not clobbered by syscalls. +TEST(ExceptionTest, X87StatusClobber) { + // See above. + struct sigaction sa = {}; + sa.sa_handler = SIG_DFL; + auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGFPE, sa)); + + EXPECT_EXIT( + { + // Clear the divide by zero exception mask. + constexpr uint16_t kControlWord = + kX87ControlWordDefault & ~kX87ControlWordDiv0Mask; + + int32_t quotient; + int32_t value = 1; + int32_t divisor = 0; + asm volatile( + "fildl %[value]\r\n" + "fidivl %[divisor]\r\n" + // Exception is masked, so it does not occur here. + "fistpl %[quotient]\r\n" + + // SYS_getpid placed in rax by constraint. + "syscall\r\n" + + // Unmask exception. The syscall didn't clobber the pending + // exception, so now it can be raised. + // + // N.B. "a floating-point exception will be generated upon execution + // of the *next* floating-point instruction". + "fldcw %[cw]\r\n" + "fwait\r\n" + : [ quotient ] "=m"(quotient) + : [ value ] "m"(value), [ divisor ] "m"(divisor), "a"(SYS_getpid), + [ cw ] "m"(kControlWord) + : "rcx", "r11"); + }, + ::testing::KilledBySignal(SIGFPE), ""); +} + +// By default, SSE exceptions are masked and simply return a default value. +TEST(ExceptionTest, SSEDivideByZeroMasked) { + uint32_t status; + int32_t quotient; + int32_t value = 1; + int32_t divisor = 0; + asm("cvtsi2ssl %[value], %%xmm0\r\n" + "cvtsi2ssl %[divisor], %%xmm1\r\n" + "divss %%xmm1, %%xmm0\r\n" + "cvtss2sil %%xmm0, %[quotient]\r\n" + : [ quotient ] "=r"(quotient), [ status ] "=r"(status) + : [ value ] "r"(value), [ divisor ] "r"(divisor) + : "xmm0", "xmm1"); + + EXPECT_EQ(quotient, INT32_MIN); +} + +// When unmasked, division by zero raises SIGFPE. +TEST(ExceptionTest, SSEDivideByZeroUnmasked) { + // See above. + struct sigaction sa = {}; + sa.sa_handler = SIG_DFL; + auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGFPE, sa)); + + EXPECT_EXIT( + { + // Clear the divide by zero exception mask. + constexpr uint32_t kMXCSR = kMXCSRDefault & ~kMXCSRDiv0Mask; + + int32_t quotient; + int32_t value = 1; + int32_t divisor = 0; + asm volatile( + "ldmxcsr %[mxcsr]\r\n" + "cvtsi2ssl %[value], %%xmm0\r\n" + "cvtsi2ssl %[divisor], %%xmm1\r\n" + "divss %%xmm1, %%xmm0\r\n" + "cvtss2sil %%xmm0, %[quotient]\r\n" + : [ quotient ] "=r"(quotient) + : [ mxcsr ] "m"(kMXCSR), [ value ] "r"(value), + [ divisor ] "r"(divisor) + : "xmm0", "xmm1"); + }, + ::testing::KilledBySignal(SIGFPE), ""); +} + +// Pending exceptions in the SSE status register are not clobbered by syscalls. +TEST(ExceptionTest, SSEStatusClobber) { + uint32_t mxcsr; + int32_t quotient; + int32_t value = 1; + int32_t divisor = 0; + asm("cvtsi2ssl %[value], %%xmm0\r\n" + "cvtsi2ssl %[divisor], %%xmm1\r\n" + "divss %%xmm1, %%xmm0\r\n" + // Exception is masked, so it does not occur here. + "cvtss2sil %%xmm0, %[quotient]\r\n" + + // SYS_getpid placed in rax by constraint. + "syscall\r\n" + + // Intel SDM Vol 1, Ch 10.2.3.1 "SIMD Floating-Point Mask and Flag Bits": + // "If LDMXCSR or FXRSTOR clears a mask bit and sets the corresponding + // exception flag bit, a SIMD floating-point exception will not be + // generated as a result of this change. The unmasked exception will be + // generated only upon the execution of the next SSE/SSE2/SSE3 instruction + // that detects the unmasked exception condition." + // + // Though ambiguous, empirical evidence indicates that this means that + // exception flags set in the status register will never cause an + // exception to be raised; only a new exception condition will do so. + // + // Thus here we just check for the flag itself rather than trying to raise + // the exception. + "stmxcsr %[mxcsr]\r\n" + : [ quotient ] "=r"(quotient), [ mxcsr ] "+m"(mxcsr) + : [ value ] "r"(value), [ divisor ] "r"(divisor), "a"(SYS_getpid) + : "xmm0", "xmm1", "rcx", "r11"); + + EXPECT_TRUE(mxcsr & kMXCSRDiv0Flag); +} + TEST(ExceptionTest, IOAccessFault) { // See above. struct sigaction sa = {}; diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc index 581f03533..b5e0a512b 100644 --- a/test/syscalls/linux/exec.cc +++ b/test/syscalls/linux/exec.cc @@ -47,23 +47,14 @@ namespace testing { namespace { -constexpr char kBasicWorkload[] = "exec_basic_workload"; -constexpr char kExitScript[] = "exit_script"; -constexpr char kStateWorkload[] = "exec_state_workload"; -constexpr char kProcExeWorkload[] = "exec_proc_exe_workload"; -constexpr char kAssertClosedWorkload[] = "exec_assert_closed_workload"; -constexpr char kPriorityWorkload[] = "priority_execve"; - -std::string WorkloadPath(absl::string_view binary) { - std::string full_path; - char* test_src = getenv("TEST_SRCDIR"); - if (test_src) { - full_path = JoinPath(test_src, "__main__/test/syscalls/linux", binary); - } - - TEST_CHECK(full_path.empty() == false); - return full_path; -} +constexpr char kBasicWorkload[] = "test/syscalls/linux/exec_basic_workload"; +constexpr char kExitScript[] = "test/syscalls/linux/exit_script"; +constexpr char kStateWorkload[] = "test/syscalls/linux/exec_state_workload"; +constexpr char kProcExeWorkload[] = + "test/syscalls/linux/exec_proc_exe_workload"; +constexpr char kAssertClosedWorkload[] = + "test/syscalls/linux/exec_assert_closed_workload"; +constexpr char kPriorityWorkload[] = "test/syscalls/linux/priority_execve"; constexpr char kExit42[] = "--exec_exit_42"; constexpr char kExecWithThread[] = "--exec_exec_with_thread"; @@ -171,44 +162,44 @@ TEST(ExecTest, EmptyPath) { } TEST(ExecTest, Basic) { - CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload)}, {}, + CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload)}, {}, ArgEnvExitStatus(0, 0), - absl::StrCat(WorkloadPath(kBasicWorkload), "\n")); + absl::StrCat(RunfilePath(kBasicWorkload), "\n")); } TEST(ExecTest, OneArg) { - CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload), "1"}, - {}, ArgEnvExitStatus(1, 0), - absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n")); + CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload), "1"}, {}, + ArgEnvExitStatus(1, 0), + absl::StrCat(RunfilePath(kBasicWorkload), "\n1\n")); } TEST(ExecTest, FiveArg) { - CheckExec(WorkloadPath(kBasicWorkload), - {WorkloadPath(kBasicWorkload), "1", "2", "3", "4", "5"}, {}, + CheckExec(RunfilePath(kBasicWorkload), + {RunfilePath(kBasicWorkload), "1", "2", "3", "4", "5"}, {}, ArgEnvExitStatus(5, 0), - absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n2\n3\n4\n5\n")); + absl::StrCat(RunfilePath(kBasicWorkload), "\n1\n2\n3\n4\n5\n")); } TEST(ExecTest, OneEnv) { - CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload)}, {"1"}, + CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload)}, {"1"}, ArgEnvExitStatus(0, 1), - absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n")); + absl::StrCat(RunfilePath(kBasicWorkload), "\n1\n")); } TEST(ExecTest, FiveEnv) { - CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload)}, + CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload)}, {"1", "2", "3", "4", "5"}, ArgEnvExitStatus(0, 5), - absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n2\n3\n4\n5\n")); + absl::StrCat(RunfilePath(kBasicWorkload), "\n1\n2\n3\n4\n5\n")); } TEST(ExecTest, OneArgOneEnv) { - CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload), "arg"}, + CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload), "arg"}, {"env"}, ArgEnvExitStatus(1, 1), - absl::StrCat(WorkloadPath(kBasicWorkload), "\narg\nenv\n")); + absl::StrCat(RunfilePath(kBasicWorkload), "\narg\nenv\n")); } TEST(ExecTest, InterpreterScript) { - CheckExec(WorkloadPath(kExitScript), {WorkloadPath(kExitScript), "25"}, {}, + CheckExec(RunfilePath(kExitScript), {RunfilePath(kExitScript), "25"}, {}, ArgEnvExitStatus(25, 0), ""); } @@ -216,7 +207,7 @@ TEST(ExecTest, InterpreterScript) { TEST(ExecTest, InterpreterScriptArgSplit) { // Symlink through /tmp to ensure the path is short enough. TempPath link = ASSERT_NO_ERRNO_AND_VALUE( - TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload))); + TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload))); TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), " foo bar"), @@ -230,7 +221,7 @@ TEST(ExecTest, InterpreterScriptArgSplit) { TEST(ExecTest, InterpreterScriptArgvZero) { // Symlink through /tmp to ensure the path is short enough. TempPath link = ASSERT_NO_ERRNO_AND_VALUE( - TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload))); + TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload))); TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path()), 0755)); @@ -244,7 +235,7 @@ TEST(ExecTest, InterpreterScriptArgvZero) { TEST(ExecTest, InterpreterScriptArgvZeroRelative) { // Symlink through /tmp to ensure the path is short enough. TempPath link = ASSERT_NO_ERRNO_AND_VALUE( - TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload))); + TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload))); TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path()), 0755)); @@ -261,7 +252,7 @@ TEST(ExecTest, InterpreterScriptArgvZeroRelative) { TEST(ExecTest, InterpreterScriptArgvZeroAdded) { // Symlink through /tmp to ensure the path is short enough. TempPath link = ASSERT_NO_ERRNO_AND_VALUE( - TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload))); + TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload))); TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path()), 0755)); @@ -274,7 +265,7 @@ TEST(ExecTest, InterpreterScriptArgvZeroAdded) { TEST(ExecTest, InterpreterScriptArgNUL) { // Symlink through /tmp to ensure the path is short enough. TempPath link = ASSERT_NO_ERRNO_AND_VALUE( - TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload))); + TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload))); TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( GetAbsoluteTestTmpdir(), @@ -289,7 +280,7 @@ TEST(ExecTest, InterpreterScriptArgNUL) { TEST(ExecTest, InterpreterScriptTrailingWhitespace) { // Symlink through /tmp to ensure the path is short enough. TempPath link = ASSERT_NO_ERRNO_AND_VALUE( - TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload))); + TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload))); TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), " "), 0755)); @@ -302,7 +293,7 @@ TEST(ExecTest, InterpreterScriptTrailingWhitespace) { TEST(ExecTest, InterpreterScriptArgWhitespace) { // Symlink through /tmp to ensure the path is short enough. TempPath link = ASSERT_NO_ERRNO_AND_VALUE( - TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload))); + TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload))); TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), " foo"), 0755)); @@ -325,7 +316,7 @@ TEST(ExecTest, InterpreterScriptNoPath) { TEST(ExecTest, ExecFn) { // Symlink through /tmp to ensure the path is short enough. TempPath link = ASSERT_NO_ERRNO_AND_VALUE( - TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kStateWorkload))); + TempPath::CreateSymlinkTo("/tmp", RunfilePath(kStateWorkload))); TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), " PrintExecFn"), @@ -342,7 +333,7 @@ TEST(ExecTest, ExecFn) { } TEST(ExecTest, ExecName) { - std::string path = WorkloadPath(kStateWorkload); + std::string path = RunfilePath(kStateWorkload); CheckExec(path, {path, "PrintExecName"}, {}, ArgEnvExitStatus(0, 0), absl::StrCat(Basename(path).substr(0, 15), "\n")); @@ -351,7 +342,7 @@ TEST(ExecTest, ExecName) { TEST(ExecTest, ExecNameScript) { // Symlink through /tmp to ensure the path is short enough. TempPath link = ASSERT_NO_ERRNO_AND_VALUE( - TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kStateWorkload))); + TempPath::CreateSymlinkTo("/tmp", RunfilePath(kStateWorkload))); TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( GetAbsoluteTestTmpdir(), @@ -405,13 +396,13 @@ TEST(ExecStateTest, HandlerReset) { ASSERT_THAT(sigaction(SIGUSR1, &sa, nullptr), SyscallSucceeds()); ExecveArray args = { - WorkloadPath(kStateWorkload), + RunfilePath(kStateWorkload), "CheckSigHandler", absl::StrCat(SIGUSR1), absl::StrCat(absl::Hex(reinterpret_cast<uintptr_t>(SIG_DFL))), }; - CheckExec(WorkloadPath(kStateWorkload), args, {}, W_EXITCODE(0, 0), ""); + CheckExec(RunfilePath(kStateWorkload), args, {}, W_EXITCODE(0, 0), ""); } // Ignored signal dispositions are not reset. @@ -421,13 +412,13 @@ TEST(ExecStateTest, IgnorePreserved) { ASSERT_THAT(sigaction(SIGUSR1, &sa, nullptr), SyscallSucceeds()); ExecveArray args = { - WorkloadPath(kStateWorkload), + RunfilePath(kStateWorkload), "CheckSigHandler", absl::StrCat(SIGUSR1), absl::StrCat(absl::Hex(reinterpret_cast<uintptr_t>(SIG_IGN))), }; - CheckExec(WorkloadPath(kStateWorkload), args, {}, W_EXITCODE(0, 0), ""); + CheckExec(RunfilePath(kStateWorkload), args, {}, W_EXITCODE(0, 0), ""); } // Signal masks are not reset on exec @@ -438,12 +429,12 @@ TEST(ExecStateTest, SignalMask) { ASSERT_THAT(sigprocmask(SIG_BLOCK, &s, nullptr), SyscallSucceeds()); ExecveArray args = { - WorkloadPath(kStateWorkload), + RunfilePath(kStateWorkload), "CheckSigBlocked", absl::StrCat(SIGUSR1), }; - CheckExec(WorkloadPath(kStateWorkload), args, {}, W_EXITCODE(0, 0), ""); + CheckExec(RunfilePath(kStateWorkload), args, {}, W_EXITCODE(0, 0), ""); } // itimers persist across execve. @@ -471,7 +462,7 @@ TEST(ExecStateTest, ItimerPreserved) { } }; - std::string filename = WorkloadPath(kStateWorkload); + std::string filename = RunfilePath(kStateWorkload); ExecveArray argv = { filename, "CheckItimerEnabled", @@ -495,8 +486,8 @@ TEST(ExecStateTest, ItimerPreserved) { TEST(ProcSelfExe, ChangesAcrossExecve) { // See exec_proc_exe_workload for more details. We simply // assert that the /proc/self/exe link changes across execve. - CheckExec(WorkloadPath(kProcExeWorkload), - {WorkloadPath(kProcExeWorkload), + CheckExec(RunfilePath(kProcExeWorkload), + {RunfilePath(kProcExeWorkload), ASSERT_NO_ERRNO_AND_VALUE(ProcessExePath(getpid()))}, {}, W_EXITCODE(0, 0), ""); } @@ -507,8 +498,8 @@ TEST(ExecTest, CloexecNormalFile) { const FileDescriptor fd_closed_on_exec = ASSERT_NO_ERRNO_AND_VALUE(Open(tempFile.path(), O_RDONLY | O_CLOEXEC)); - CheckExec(WorkloadPath(kAssertClosedWorkload), - {WorkloadPath(kAssertClosedWorkload), + CheckExec(RunfilePath(kAssertClosedWorkload), + {RunfilePath(kAssertClosedWorkload), absl::StrCat(fd_closed_on_exec.get())}, {}, W_EXITCODE(0, 0), ""); @@ -517,10 +508,10 @@ TEST(ExecTest, CloexecNormalFile) { const FileDescriptor fd_open_on_exec = ASSERT_NO_ERRNO_AND_VALUE(Open(tempFile.path(), O_RDONLY)); - CheckExec(WorkloadPath(kAssertClosedWorkload), - {WorkloadPath(kAssertClosedWorkload), - absl::StrCat(fd_open_on_exec.get())}, - {}, W_EXITCODE(2, 0), ""); + CheckExec( + RunfilePath(kAssertClosedWorkload), + {RunfilePath(kAssertClosedWorkload), absl::StrCat(fd_open_on_exec.get())}, + {}, W_EXITCODE(2, 0), ""); } TEST(ExecTest, CloexecEventfd) { @@ -528,15 +519,15 @@ TEST(ExecTest, CloexecEventfd) { ASSERT_THAT(efd = eventfd(0, EFD_CLOEXEC), SyscallSucceeds()); FileDescriptor fd(efd); - CheckExec(WorkloadPath(kAssertClosedWorkload), - {WorkloadPath(kAssertClosedWorkload), absl::StrCat(fd.get())}, {}, + CheckExec(RunfilePath(kAssertClosedWorkload), + {RunfilePath(kAssertClosedWorkload), absl::StrCat(fd.get())}, {}, W_EXITCODE(0, 0), ""); } constexpr int kLinuxMaxSymlinks = 40; TEST(ExecTest, SymlinkLimitExceeded) { - std::string path = WorkloadPath(kBasicWorkload); + std::string path = RunfilePath(kBasicWorkload); // Hold onto TempPath objects so they are not destructed prematurely. std::vector<TempPath> symlinks; @@ -575,13 +566,13 @@ TEST(ExecTest, SymlinkLimitRefreshedForInterpreter) { } TEST(ExecveatTest, BasicWithFDCWD) { - std::string path = WorkloadPath(kBasicWorkload); + std::string path = RunfilePath(kBasicWorkload); CheckExecveat(AT_FDCWD, path, {path}, {}, /*flags=*/0, ArgEnvExitStatus(0, 0), absl::StrCat(path, "\n")); } TEST(ExecveatTest, Basic) { - std::string absolute_path = WorkloadPath(kBasicWorkload); + std::string absolute_path = RunfilePath(kBasicWorkload); std::string parent_dir = std::string(Dirname(absolute_path)); std::string base = std::string(Basename(absolute_path)); const FileDescriptor dirfd = @@ -592,7 +583,7 @@ TEST(ExecveatTest, Basic) { } TEST(ExecveatTest, FDNotADirectory) { - std::string absolute_path = WorkloadPath(kBasicWorkload); + std::string absolute_path = RunfilePath(kBasicWorkload); std::string base = std::string(Basename(absolute_path)); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(absolute_path, 0)); @@ -604,13 +595,13 @@ TEST(ExecveatTest, FDNotADirectory) { } TEST(ExecveatTest, AbsolutePathWithFDCWD) { - std::string path = WorkloadPath(kBasicWorkload); + std::string path = RunfilePath(kBasicWorkload); CheckExecveat(AT_FDCWD, path, {path}, {}, ArgEnvExitStatus(0, 0), 0, absl::StrCat(path, "\n")); } TEST(ExecveatTest, AbsolutePath) { - std::string path = WorkloadPath(kBasicWorkload); + std::string path = RunfilePath(kBasicWorkload); // File descriptor should be ignored when an absolute path is given. const int32_t badFD = -1; CheckExecveat(badFD, path, {path}, {}, ArgEnvExitStatus(0, 0), 0, @@ -618,7 +609,7 @@ TEST(ExecveatTest, AbsolutePath) { } TEST(ExecveatTest, EmptyPathBasic) { - std::string path = WorkloadPath(kBasicWorkload); + std::string path = RunfilePath(kBasicWorkload); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_PATH)); CheckExecveat(fd.get(), "", {path}, {}, AT_EMPTY_PATH, ArgEnvExitStatus(0, 0), @@ -626,7 +617,7 @@ TEST(ExecveatTest, EmptyPathBasic) { } TEST(ExecveatTest, EmptyPathWithDirFD) { - std::string path = WorkloadPath(kBasicWorkload); + std::string path = RunfilePath(kBasicWorkload); std::string parent_dir = std::string(Dirname(path)); const FileDescriptor dirfd = ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_DIRECTORY)); @@ -639,7 +630,7 @@ TEST(ExecveatTest, EmptyPathWithDirFD) { } TEST(ExecveatTest, EmptyPathWithoutEmptyPathFlag) { - std::string path = WorkloadPath(kBasicWorkload); + std::string path = RunfilePath(kBasicWorkload); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_PATH)); int execve_errno; @@ -649,7 +640,7 @@ TEST(ExecveatTest, EmptyPathWithoutEmptyPathFlag) { } TEST(ExecveatTest, AbsolutePathWithEmptyPathFlag) { - std::string path = WorkloadPath(kBasicWorkload); + std::string path = RunfilePath(kBasicWorkload); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_PATH)); CheckExecveat(fd.get(), path, {path}, {}, AT_EMPTY_PATH, @@ -657,7 +648,7 @@ TEST(ExecveatTest, AbsolutePathWithEmptyPathFlag) { } TEST(ExecveatTest, RelativePathWithEmptyPathFlag) { - std::string absolute_path = WorkloadPath(kBasicWorkload); + std::string absolute_path = RunfilePath(kBasicWorkload); std::string parent_dir = std::string(Dirname(absolute_path)); std::string base = std::string(Basename(absolute_path)); const FileDescriptor dirfd = @@ -670,7 +661,7 @@ TEST(ExecveatTest, RelativePathWithEmptyPathFlag) { TEST(ExecveatTest, SymlinkNoFollowWithRelativePath) { std::string parent_dir = "/tmp"; TempPath link = ASSERT_NO_ERRNO_AND_VALUE( - TempPath::CreateSymlinkTo(parent_dir, WorkloadPath(kBasicWorkload))); + TempPath::CreateSymlinkTo(parent_dir, RunfilePath(kBasicWorkload))); const FileDescriptor dirfd = ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_DIRECTORY)); std::string base = std::string(Basename(link.path())); @@ -685,7 +676,7 @@ TEST(ExecveatTest, SymlinkNoFollowWithRelativePath) { TEST(ExecveatTest, SymlinkNoFollowWithAbsolutePath) { std::string parent_dir = "/tmp"; TempPath link = ASSERT_NO_ERRNO_AND_VALUE( - TempPath::CreateSymlinkTo(parent_dir, WorkloadPath(kBasicWorkload))); + TempPath::CreateSymlinkTo(parent_dir, RunfilePath(kBasicWorkload))); std::string path = link.path(); int execve_errno; @@ -697,7 +688,7 @@ TEST(ExecveatTest, SymlinkNoFollowWithAbsolutePath) { TEST(ExecveatTest, SymlinkNoFollowAndEmptyPath) { TempPath link = ASSERT_NO_ERRNO_AND_VALUE( - TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload))); + TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload))); std::string path = link.path(); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, 0)); @@ -723,7 +714,7 @@ TEST(ExecveatTest, SymlinkNoFollowWithNormalFile) { } TEST(ExecveatTest, BasicWithCloexecFD) { - std::string path = WorkloadPath(kBasicWorkload); + std::string path = RunfilePath(kBasicWorkload); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_CLOEXEC)); CheckExecveat(fd.get(), "", {path}, {}, AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH, @@ -731,7 +722,7 @@ TEST(ExecveatTest, BasicWithCloexecFD) { } TEST(ExecveatTest, InterpreterScriptWithCloexecFD) { - std::string path = WorkloadPath(kExitScript); + std::string path = RunfilePath(kExitScript); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_CLOEXEC)); int execve_errno; @@ -742,7 +733,7 @@ TEST(ExecveatTest, InterpreterScriptWithCloexecFD) { } TEST(ExecveatTest, InterpreterScriptWithCloexecDirFD) { - std::string absolute_path = WorkloadPath(kExitScript); + std::string absolute_path = RunfilePath(kExitScript); std::string parent_dir = std::string(Dirname(absolute_path)); std::string base = std::string(Basename(absolute_path)); const FileDescriptor dirfd = @@ -775,7 +766,7 @@ TEST(GetpriorityTest, ExecveMaintainsPriority) { // Program run (priority_execve) will exit(X) where // X=getpriority(PRIO_PROCESS,0). Check that this exit value is prio. - CheckExec(WorkloadPath(kPriorityWorkload), {WorkloadPath(kPriorityWorkload)}, + CheckExec(RunfilePath(kPriorityWorkload), {RunfilePath(kPriorityWorkload)}, {}, W_EXITCODE(expected_exit_code, 0), ""); } diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc index 8a45be12a..4f3aa81d6 100644 --- a/test/syscalls/linux/fcntl.cc +++ b/test/syscalls/linux/fcntl.cc @@ -14,6 +14,7 @@ #include <fcntl.h> #include <signal.h> +#include <sys/types.h> #include <syscall.h> #include <unistd.h> @@ -32,6 +33,7 @@ #include "test/util/eventfd_util.h" #include "test/util/multiprocess_util.h" #include "test/util/posix_error.h" +#include "test/util/save_util.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" #include "test/util/timer_util.h" @@ -910,8 +912,166 @@ TEST(FcntlTest, GetOwn) { FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); - ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), + EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), 0); + MaybeSave(); +} + +TEST(FcntlTest, GetOwnEx) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + f_owner_ex owner = {}; + EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &owner), + SyscallSucceedsWithValue(0)); +} + +TEST(FcntlTest, SetOwnExInvalidType) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + f_owner_ex owner = {}; + owner.type = __pid_type(-1); + EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(FcntlTest, SetOwnExInvalidTid) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + f_owner_ex owner = {}; + owner.type = F_OWNER_TID; + owner.pid = -1; + + EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), + SyscallFailsWithErrno(ESRCH)); +} + +TEST(FcntlTest, SetOwnExInvalidPid) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + f_owner_ex owner = {}; + owner.type = F_OWNER_PID; + owner.pid = -1; + + EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), + SyscallFailsWithErrno(ESRCH)); +} + +TEST(FcntlTest, SetOwnExInvalidPgrp) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + f_owner_ex owner = {}; + owner.type = F_OWNER_PGRP; + owner.pid = -1; + + EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), + SyscallFailsWithErrno(ESRCH)); +} + +TEST(FcntlTest, SetOwnExTid) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + f_owner_ex owner = {}; + owner.type = F_OWNER_TID; + EXPECT_THAT(owner.pid = syscall(__NR_gettid), SyscallSucceeds()); + + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), + SyscallSucceeds()); + + EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), owner.pid); + MaybeSave(); +} + +TEST(FcntlTest, SetOwnExPid) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + f_owner_ex owner = {}; + owner.type = F_OWNER_PID; + EXPECT_THAT(owner.pid = getpid(), SyscallSucceeds()); + + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), + SyscallSucceeds()); + + EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), owner.pid); + MaybeSave(); +} + +TEST(FcntlTest, SetOwnExPgrp) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + f_owner_ex owner = {}; + owner.type = F_OWNER_PGRP; + EXPECT_THAT(owner.pid = getpgrp(), SyscallSucceeds()); + + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), + SyscallSucceeds()); + + // NOTE(igudger): I don't understand why, but this is flaky on Linux. + // GetOwnExPgrp (below) does not have this issue. + SKIP_IF(!IsRunningOnGvisor()); + + EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), -owner.pid); + MaybeSave(); +} + +TEST(FcntlTest, GetOwnExTid) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + f_owner_ex set_owner = {}; + set_owner.type = F_OWNER_TID; + EXPECT_THAT(set_owner.pid = syscall(__NR_gettid), SyscallSucceeds()); + + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner), + SyscallSucceeds()); + + f_owner_ex got_owner = {}; + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(got_owner.type, set_owner.type); + EXPECT_EQ(got_owner.pid, set_owner.pid); +} + +TEST(FcntlTest, GetOwnExPid) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + f_owner_ex set_owner = {}; + set_owner.type = F_OWNER_PID; + EXPECT_THAT(set_owner.pid = getpid(), SyscallSucceeds()); + + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner), + SyscallSucceeds()); + + f_owner_ex got_owner = {}; + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(got_owner.type, set_owner.type); + EXPECT_EQ(got_owner.pid, set_owner.pid); +} + +TEST(FcntlTest, GetOwnExPgrp) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + f_owner_ex set_owner = {}; + set_owner.type = F_OWNER_PGRP; + EXPECT_THAT(set_owner.pid = getpgrp(), SyscallSucceeds()); + + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner), + SyscallSucceeds()); + + f_owner_ex got_owner = {}; + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner), SyscallSucceedsWithValue(0)); + EXPECT_EQ(got_owner.type, set_owner.type); + EXPECT_EQ(got_owner.pid, set_owner.pid); } } // namespace diff --git a/test/syscalls/linux/file_base.h b/test/syscalls/linux/file_base.h index 4e048320e..6f80bc97c 100644 --- a/test/syscalls/linux/file_base.h +++ b/test/syscalls/linux/file_base.h @@ -111,95 +111,6 @@ class FileTest : public ::testing::Test { int test_pipe_[2]; }; -class SocketTest : public ::testing::Test { - public: - void SetUp() override { - test_unix_stream_socket_[0] = -1; - test_unix_stream_socket_[1] = -1; - test_unix_dgram_socket_[0] = -1; - test_unix_dgram_socket_[1] = -1; - test_unix_seqpacket_socket_[0] = -1; - test_unix_seqpacket_socket_[1] = -1; - test_tcp_socket_[0] = -1; - test_tcp_socket_[1] = -1; - - ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, 0, test_unix_stream_socket_), - SyscallSucceeds()); - ASSERT_THAT(fcntl(test_unix_stream_socket_[0], F_SETFL, O_NONBLOCK), - SyscallSucceeds()); - ASSERT_THAT(socketpair(AF_UNIX, SOCK_DGRAM, 0, test_unix_dgram_socket_), - SyscallSucceeds()); - ASSERT_THAT(fcntl(test_unix_dgram_socket_[0], F_SETFL, O_NONBLOCK), - SyscallSucceeds()); - ASSERT_THAT( - socketpair(AF_UNIX, SOCK_SEQPACKET, 0, test_unix_seqpacket_socket_), - SyscallSucceeds()); - ASSERT_THAT(fcntl(test_unix_seqpacket_socket_[0], F_SETFL, O_NONBLOCK), - SyscallSucceeds()); - } - - void TearDown() override { - close(test_unix_stream_socket_[0]); - close(test_unix_stream_socket_[1]); - - close(test_unix_dgram_socket_[0]); - close(test_unix_dgram_socket_[1]); - - close(test_unix_seqpacket_socket_[0]); - close(test_unix_seqpacket_socket_[1]); - - close(test_tcp_socket_[0]); - close(test_tcp_socket_[1]); - } - - int test_unix_stream_socket_[2]; - int test_unix_dgram_socket_[2]; - int test_unix_seqpacket_socket_[2]; - int test_tcp_socket_[2]; -}; - -// MatchesStringLength checks that a tuple argument of (struct iovec *, int) -// corresponding to an iovec array and its length, contains data that matches -// the string length strlen. -MATCHER_P(MatchesStringLength, strlen, "") { - struct iovec* iovs = arg.first; - int niov = arg.second; - int offset = 0; - for (int i = 0; i < niov; i++) { - offset += iovs[i].iov_len; - } - if (offset != static_cast<int>(strlen)) { - *result_listener << offset; - return false; - } - return true; -} - -// MatchesStringValue checks that a tuple argument of (struct iovec *, int) -// corresponding to an iovec array and its length, contains data that matches -// the string value str. -MATCHER_P(MatchesStringValue, str, "") { - struct iovec* iovs = arg.first; - int len = strlen(str); - int niov = arg.second; - int offset = 0; - for (int i = 0; i < niov; i++) { - struct iovec iov = iovs[i]; - if (len < offset) { - *result_listener << "strlen " << len << " < offset " << offset; - return false; - } - if (strncmp(static_cast<char*>(iov.iov_base), &str[offset], iov.iov_len)) { - absl::string_view iovec_string(static_cast<char*>(iov.iov_base), - iov.iov_len); - *result_listener << iovec_string << " @offset " << offset; - return false; - } - offset += iov.iov_len; - } - return true; -} - } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/futex.cc b/test/syscalls/linux/futex.cc index d3e3f998c..40c80a6e1 100644 --- a/test/syscalls/linux/futex.cc +++ b/test/syscalls/linux/futex.cc @@ -239,6 +239,27 @@ TEST_P(PrivateAndSharedFutexTest, Wake1_NoRandomSave) { EXPECT_THAT(futex_wake(IsPrivate(), &a, 1), SyscallSucceedsWithValue(1)); } +TEST_P(PrivateAndSharedFutexTest, Wake0_NoRandomSave) { + constexpr int kInitialValue = 1; + std::atomic<int> a = ATOMIC_VAR_INIT(kInitialValue); + + // Prevent save/restore from interrupting futex_wait, which will cause it to + // return EAGAIN instead of the expected result if futex_wait is restarted + // after we change the value of a below. + DisableSave ds; + ScopedThread thread([&] { + EXPECT_THAT(futex_wait(IsPrivate(), &a, kInitialValue), + SyscallSucceedsWithValue(0)); + }); + absl::SleepFor(kWaiterStartupDelay); + + // Change a so that if futex_wake happens before futex_wait, the latter + // returns EAGAIN instead of hanging the test. + a.fetch_add(1); + // The Linux kernel wakes one waiter even if val is 0 or negative. + EXPECT_THAT(futex_wake(IsPrivate(), &a, 0), SyscallSucceedsWithValue(1)); +} + TEST_P(PrivateAndSharedFutexTest, WakeAll_NoRandomSave) { constexpr int kInitialValue = 1; std::atomic<int> a = ATOMIC_VAR_INIT(kInitialValue); diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc index 7384c27dc..59ec9940a 100644 --- a/test/syscalls/linux/inotify.cc +++ b/test/syscalls/linux/inotify.cc @@ -1591,6 +1591,34 @@ TEST(Inotify, EpollNoDeadlock) { } } +TEST(Inotify, SpliceEvent) { + int pipes[2]; + ASSERT_THAT(pipe2(pipes, O_NONBLOCK), SyscallSucceeds()); + + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const TempPath file1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + root.path(), "some content", TempPath::kDefaultFileMode)); + + const FileDescriptor file1_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY)); + const int watcher = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS)); + + char buf; + EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds()); + + EXPECT_THAT(splice(fd.get(), nullptr, pipes[1], nullptr, + sizeof(struct inotify_event) + 1, SPLICE_F_NONBLOCK), + SyscallSucceedsWithValue(sizeof(struct inotify_event))); + + const FileDescriptor read_fd(pipes[0]); + const std::vector<Event> events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(read_fd.get())); + ASSERT_THAT(events, Are({Event(IN_ACCESS, watcher)})); +} + } // namespace } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/ioctl.cc b/test/syscalls/linux/ioctl.cc index c4f8bff08..b0a07a064 100644 --- a/test/syscalls/linux/ioctl.cc +++ b/test/syscalls/linux/ioctl.cc @@ -215,7 +215,8 @@ TEST_F(IoctlTest, FIOASYNCSelfTarget2) { auto mask_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGIO)); - pid_t pid = getpid(); + pid_t pid = -1; + EXPECT_THAT(pid = getpid(), SyscallSucceeds()); EXPECT_THAT(ioctl(pair->second_fd(), FIOSETOWN, &pid), SyscallSucceeds()); int set = 1; diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h index 072230d85..9cb4566db 100644 --- a/test/syscalls/linux/ip_socket_test_util.h +++ b/test/syscalls/linux/ip_socket_test_util.h @@ -26,25 +26,6 @@ namespace gvisor { namespace testing { -// Possible values of the "st" field in a /proc/net/{tcp,udp} entry. Source: -// Linux kernel, include/net/tcp_states.h. -enum { - TCP_ESTABLISHED = 1, - TCP_SYN_SENT, - TCP_SYN_RECV, - TCP_FIN_WAIT1, - TCP_FIN_WAIT2, - TCP_TIME_WAIT, - TCP_CLOSE, - TCP_CLOSE_WAIT, - TCP_LAST_ACK, - TCP_LISTEN, - TCP_CLOSING, - TCP_NEW_SYN_RECV, - - TCP_MAX_STATES -}; - // Extracts the IP address from an inet sockaddr in network byte order. uint32_t IPFromInetSockaddr(const struct sockaddr* addr); diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc index 6f2639d8a..1c4d9f1c7 100644 --- a/test/syscalls/linux/mmap.cc +++ b/test/syscalls/linux/mmap.cc @@ -814,23 +814,27 @@ class MMapFileTest : public MMapTest { } }; +class MMapFileParamTest + : public MMapFileTest, + public ::testing::WithParamInterface<std::tuple<int, int>> { + protected: + int prot() const { return std::get<0>(GetParam()); } + + int flags() const { return std::get<1>(GetParam()); } +}; + // MAP_POPULATE allowed. // There isn't a good way to verify it actually did anything. -// -// FIXME(b/37222275): Parameterize. -TEST_F(MMapFileTest, MapPopulate) { - ASSERT_THAT( - Map(0, kPageSize, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd_.get(), 0), - SyscallSucceeds()); +TEST_P(MMapFileParamTest, MapPopulate) { + ASSERT_THAT(Map(0, kPageSize, prot(), flags() | MAP_POPULATE, fd_.get(), 0), + SyscallSucceeds()); } // MAP_POPULATE on a short file. -// -// FIXME(b/37222275): Parameterize. -TEST_F(MMapFileTest, MapPopulateShort) { - ASSERT_THAT(Map(0, 2 * kPageSize, PROT_READ, MAP_PRIVATE | MAP_POPULATE, - fd_.get(), 0), - SyscallSucceeds()); +TEST_P(MMapFileParamTest, MapPopulateShort) { + ASSERT_THAT( + Map(0, 2 * kPageSize, prot(), flags() | MAP_POPULATE, fd_.get(), 0), + SyscallSucceeds()); } // Read contents from mapped file. @@ -901,16 +905,6 @@ TEST_F(MMapFileTest, WritePrivateOnReadOnlyFd) { reinterpret_cast<volatile char*>(addr)); } -// MAP_PRIVATE PROT_READ is not allowed on write-only FDs. -TEST_F(MMapFileTest, ReadPrivateOnWriteOnlyFd) { - const FileDescriptor fd = - ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_WRONLY)); - - uintptr_t addr; - EXPECT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_PRIVATE, fd.get(), 0), - SyscallFailsWithErrno(EACCES)); -} - // MAP_SHARED PROT_WRITE not allowed on read-only FDs. TEST_F(MMapFileTest, WriteSharedOnReadOnlyFd) { const FileDescriptor fd = @@ -922,28 +916,13 @@ TEST_F(MMapFileTest, WriteSharedOnReadOnlyFd) { SyscallFailsWithErrno(EACCES)); } -// MAP_SHARED PROT_READ not allowed on write-only FDs. -// -// FIXME(b/37222275): Parameterize. -TEST_F(MMapFileTest, ReadSharedOnWriteOnlyFd) { - const FileDescriptor fd = - ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_WRONLY)); - - uintptr_t addr; - EXPECT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd.get(), 0), - SyscallFailsWithErrno(EACCES)); -} - -// MAP_SHARED PROT_WRITE not allowed on write-only FDs. -// The FD must always be readable. -// -// FIXME(b/37222275): Parameterize. -TEST_F(MMapFileTest, WriteSharedOnWriteOnlyFd) { +// The FD must be readable. +TEST_P(MMapFileParamTest, WriteOnlyFd) { const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_WRONLY)); uintptr_t addr; - EXPECT_THAT(addr = Map(0, kPageSize, PROT_WRITE, MAP_SHARED, fd.get(), 0), + EXPECT_THAT(addr = Map(0, kPageSize, prot(), flags(), fd.get(), 0), SyscallFailsWithErrno(EACCES)); } @@ -1182,7 +1161,7 @@ TEST_F(MMapFileTest, ReadSharedTruncateDownThenUp) { ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0), SyscallSucceeds()); - // Check that the memory contains he file data. + // Check that the memory contains the file data. EXPECT_EQ(0, memcmp(reinterpret_cast<void*>(addr), buf.c_str(), kPageSize)); // Truncate down, then up. @@ -1371,125 +1350,68 @@ TEST_F(MMapFileTest, WritePrivate) { EqualsMemory(std::string(len, '\0'))); } -// SIGBUS raised when writing past end of file to a private mapping. -// -// FIXME(b/37222275): Parameterize. -TEST_F(MMapFileTest, SigBusDeathWritePrivate) { +// SIGBUS raised when reading or writing past end of a mapped file. +TEST_P(MMapFileParamTest, SigBusDeath) { SetupGvisorDeathTest(); uintptr_t addr; - ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, - fd_.get(), 0), + ASSERT_THAT(addr = Map(0, 2 * kPageSize, prot(), flags(), fd_.get(), 0), SyscallSucceeds()); - // MMapFileTest makes a file kPageSize/2 long. The entire first page will be - // accessible. Write just beyond that. - size_t len = strlen(kFileContents); - EXPECT_EXIT(std::copy(kFileContents, kFileContents + len, - reinterpret_cast<volatile char*>(addr + kPageSize)), - ::testing::KilledBySignal(SIGBUS), ""); -} - -// SIGBUS raised when reading past end of file on a shared mapping. -// -// FIXME(b/37222275): Parameterize. -TEST_F(MMapFileTest, SigBusDeathReadShared) { - SetupGvisorDeathTest(); - - uintptr_t addr; - ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0), - SyscallSucceeds()); - - // MMapFileTest makes a file kPageSize/2 long. The entire first page will be - // accessible. Read just beyond that. - std::vector<char> in(kPageSize); - EXPECT_EXIT( - std::copy(reinterpret_cast<volatile char*>(addr + kPageSize), - reinterpret_cast<volatile char*>(addr + kPageSize) + kPageSize, - in.data()), - ::testing::KilledBySignal(SIGBUS), ""); + auto* start = reinterpret_cast<volatile char*>(addr + kPageSize); + + // MMapFileTest makes a file kPageSize/2 long. The entire first page should be + // accessible, but anything beyond it should not. + if (prot() & PROT_WRITE) { + // Write beyond first page. + size_t len = strlen(kFileContents); + EXPECT_EXIT(std::copy(kFileContents, kFileContents + len, start), + ::testing::KilledBySignal(SIGBUS), ""); + } else { + // Read beyond first page. + std::vector<char> in(kPageSize); + EXPECT_EXIT(std::copy(start, start + kPageSize, in.data()), + ::testing::KilledBySignal(SIGBUS), ""); + } } -// SIGBUS raised when reading past end of file on a shared mapping. +// Tests that SIGBUS is not raised when reading or writing to a file-mapped +// page before EOF, even if part of the mapping extends beyond EOF. // -// FIXME(b/37222275): Parameterize. -TEST_F(MMapFileTest, SigBusDeathWriteShared) { - SetupGvisorDeathTest(); - +// See b/27877699. +TEST_P(MMapFileParamTest, NoSigBusOnPagesBeforeEOF) { uintptr_t addr; - ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, - fd_.get(), 0), - SyscallSucceeds()); - - // MMapFileTest makes a file kPageSize/2 long. The entire first page will be - // accessible. Write just beyond that. - size_t len = strlen(kFileContents); - EXPECT_EXIT(std::copy(kFileContents, kFileContents + len, - reinterpret_cast<volatile char*>(addr + kPageSize)), - ::testing::KilledBySignal(SIGBUS), ""); -} - -// Tests that SIGBUS is not raised when writing to a file-mapped page before -// EOF, even if part of the mapping extends beyond EOF. -TEST_F(MMapFileTest, NoSigBusOnPagesBeforeEOF) { - uintptr_t addr; - ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, - fd_.get(), 0), + ASSERT_THAT(addr = Map(0, 2 * kPageSize, prot(), flags(), fd_.get(), 0), SyscallSucceeds()); // The test passes if this survives. - size_t len = strlen(kFileContents); - std::copy(kFileContents, kFileContents + len, - reinterpret_cast<volatile char*>(addr)); -} - -// Tests that SIGBUS is not raised when writing to a file-mapped page containing -// EOF, *after* the EOF for a private mapping. -TEST_F(MMapFileTest, NoSigBusOnPageContainingEOFWritePrivate) { - uintptr_t addr; - ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, - fd_.get(), 0), - SyscallSucceeds()); - - // The test passes if this survives. (Technically addr+kPageSize/2 is already - // beyond EOF, but +1 to check for fencepost errors.) - size_t len = strlen(kFileContents); - std::copy(kFileContents, kFileContents + len, - reinterpret_cast<volatile char*>(addr + (kPageSize / 2) + 1)); -} - -// Tests that SIGBUS is not raised when reading from a file-mapped page -// containing EOF, *after* the EOF for a shared mapping. -// -// FIXME(b/37222275): Parameterize. -TEST_F(MMapFileTest, NoSigBusOnPageContainingEOFReadShared) { - uintptr_t addr; - ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0), - SyscallSucceeds()); - - // The test passes if this survives. (Technically addr+kPageSize/2 is already - // beyond EOF, but +1 to check for fencepost errors.) auto* start = reinterpret_cast<volatile char*>(addr + (kPageSize / 2) + 1); size_t len = strlen(kFileContents); - std::vector<char> in(len); - std::copy(start, start + len, in.data()); + if (prot() & PROT_WRITE) { + std::copy(kFileContents, kFileContents + len, start); + } else { + std::vector<char> in(len); + std::copy(start, start + len, in.data()); + } } -// Tests that SIGBUS is not raised when writing to a file-mapped page containing -// EOF, *after* the EOF for a shared mapping. -// -// FIXME(b/37222275): Parameterize. -TEST_F(MMapFileTest, NoSigBusOnPageContainingEOFWriteShared) { +// Tests that SIGBUS is not raised when reading or writing from a file-mapped +// page containing EOF, *after* the EOF. +TEST_P(MMapFileParamTest, NoSigBusOnPageContainingEOF) { uintptr_t addr; - ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, - fd_.get(), 0), + ASSERT_THAT(addr = Map(0, 2 * kPageSize, prot(), flags(), fd_.get(), 0), SyscallSucceeds()); // The test passes if this survives. (Technically addr+kPageSize/2 is already // beyond EOF, but +1 to check for fencepost errors.) + auto* start = reinterpret_cast<volatile char*>(addr + (kPageSize / 2) + 1); size_t len = strlen(kFileContents); - std::copy(kFileContents, kFileContents + len, - reinterpret_cast<volatile char*>(addr + (kPageSize / 2) + 1)); + if (prot() & PROT_WRITE) { + std::copy(kFileContents, kFileContents + len, start); + } else { + std::vector<char> in(len); + std::copy(start, start + len, in.data()); + } } // Tests that reading from writable shared file-mapped pages succeeds. @@ -1733,6 +1655,15 @@ TEST(MMapNoFixtureTest, Map32Bit) { #endif // defined(__x86_64__) +INSTANTIATE_TEST_SUITE_P( + ReadWriteSharedPrivate, MMapFileParamTest, + ::testing::Combine(::testing::ValuesIn({ + PROT_READ, + PROT_WRITE, + PROT_READ | PROT_WRITE, + }), + ::testing::ValuesIn({MAP_SHARED, MAP_PRIVATE}))); + } // namespace } // namespace testing diff --git a/test/syscalls/linux/partial_bad_buffer.cc b/test/syscalls/linux/partial_bad_buffer.cc index 33822ee57..df7129acc 100644 --- a/test/syscalls/linux/partial_bad_buffer.cc +++ b/test/syscalls/linux/partial_bad_buffer.cc @@ -18,7 +18,9 @@ #include <netinet/tcp.h> #include <sys/mman.h> #include <sys/socket.h> +#include <sys/stat.h> #include <sys/syscall.h> +#include <sys/types.h> #include <sys/uio.h> #include <unistd.h> @@ -62,9 +64,9 @@ class PartialBadBufferTest : public ::testing::Test { // Write some initial data. size_t size = sizeof(kMessage) - 1; EXPECT_THAT(WriteFd(fd_, &kMessage, size), SyscallSucceedsWithValue(size)); - ASSERT_THAT(lseek(fd_, 0, SEEK_SET), SyscallSucceeds()); + // Map a useable buffer. addr_ = mmap(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); ASSERT_NE(addr_, MAP_FAILED); @@ -79,6 +81,15 @@ class PartialBadBufferTest : public ::testing::Test { bad_buffer_ = buf + kPageSize - 1; } + off_t Size() { + struct stat st; + int rc = fstat(fd_, &st); + if (rc < 0) { + return static_cast<off_t>(rc); + } + return st.st_size; + } + void TearDown() override { EXPECT_THAT(munmap(addr_, 2 * kPageSize), SyscallSucceeds()) << addr_; EXPECT_THAT(close(fd_), SyscallSucceeds()); @@ -165,97 +176,99 @@ TEST_F(PartialBadBufferTest, PreadvSmall) { } TEST_F(PartialBadBufferTest, WriteBig) { - // FIXME(b/24788078): The sentry write syscalls will return immediately - // if Access returns an error, but Access may not return an error - // and the sentry will instead perform a partial write. - SKIP_IF(IsRunningOnGvisor()); + off_t orig_size = Size(); + int n; - EXPECT_THAT(RetryEINTR(write)(fd_, bad_buffer_, kPageSize), - SyscallFailsWithErrno(EFAULT)); + ASSERT_THAT(lseek(fd_, orig_size, SEEK_SET), SyscallSucceeds()); + EXPECT_THAT( + (n = RetryEINTR(write)(fd_, bad_buffer_, kPageSize)), + AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1))); + EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0)); } TEST_F(PartialBadBufferTest, WriteSmall) { - // FIXME(b/24788078): The sentry write syscalls will return immediately - // if Access returns an error, but Access may not return an error - // and the sentry will instead perform a partial write. - SKIP_IF(IsRunningOnGvisor()); + off_t orig_size = Size(); + int n; - EXPECT_THAT(RetryEINTR(write)(fd_, bad_buffer_, 10), - SyscallFailsWithErrno(EFAULT)); + ASSERT_THAT(lseek(fd_, orig_size, SEEK_SET), SyscallSucceeds()); + EXPECT_THAT( + (n = RetryEINTR(write)(fd_, bad_buffer_, 10)), + AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1))); + EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0)); } TEST_F(PartialBadBufferTest, PwriteBig) { - // FIXME(b/24788078): The sentry write syscalls will return immediately - // if Access returns an error, but Access may not return an error - // and the sentry will instead perform a partial write. - SKIP_IF(IsRunningOnGvisor()); + off_t orig_size = Size(); + int n; - EXPECT_THAT(RetryEINTR(pwrite)(fd_, bad_buffer_, kPageSize, 0), - SyscallFailsWithErrno(EFAULT)); + EXPECT_THAT( + (n = RetryEINTR(pwrite)(fd_, bad_buffer_, kPageSize, orig_size)), + AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1))); + EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0)); } TEST_F(PartialBadBufferTest, PwriteSmall) { - // FIXME(b/24788078): The sentry write syscalls will return immediately - // if Access returns an error, but Access may not return an error - // and the sentry will instead perform a partial write. - SKIP_IF(IsRunningOnGvisor()); + off_t orig_size = Size(); + int n; - EXPECT_THAT(RetryEINTR(pwrite)(fd_, bad_buffer_, 10, 0), - SyscallFailsWithErrno(EFAULT)); + EXPECT_THAT( + (n = RetryEINTR(pwrite)(fd_, bad_buffer_, 10, orig_size)), + AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1))); + EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0)); } TEST_F(PartialBadBufferTest, WritevBig) { - // FIXME(b/24788078): The sentry write syscalls will return immediately - // if Access returns an error, but Access may not return an error - // and the sentry will instead perform a partial write. - SKIP_IF(IsRunningOnGvisor()); - struct iovec vec; vec.iov_base = bad_buffer_; vec.iov_len = kPageSize; + off_t orig_size = Size(); + int n; - EXPECT_THAT(RetryEINTR(writev)(fd_, &vec, 1), SyscallFailsWithErrno(EFAULT)); + ASSERT_THAT(lseek(fd_, orig_size, SEEK_SET), SyscallSucceeds()); + EXPECT_THAT( + (n = RetryEINTR(writev)(fd_, &vec, 1)), + AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1))); + EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0)); } TEST_F(PartialBadBufferTest, WritevSmall) { - // FIXME(b/24788078): The sentry write syscalls will return immediately - // if Access returns an error, but Access may not return an error - // and the sentry will instead perform a partial write. - SKIP_IF(IsRunningOnGvisor()); - struct iovec vec; vec.iov_base = bad_buffer_; vec.iov_len = 10; + off_t orig_size = Size(); + int n; - EXPECT_THAT(RetryEINTR(writev)(fd_, &vec, 1), SyscallFailsWithErrno(EFAULT)); + ASSERT_THAT(lseek(fd_, orig_size, SEEK_SET), SyscallSucceeds()); + EXPECT_THAT( + (n = RetryEINTR(writev)(fd_, &vec, 1)), + AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1))); + EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0)); } TEST_F(PartialBadBufferTest, PwritevBig) { - // FIXME(b/24788078): The sentry write syscalls will return immediately - // if Access returns an error, but Access may not return an error - // and the sentry will instead perform a partial write. - SKIP_IF(IsRunningOnGvisor()); - struct iovec vec; vec.iov_base = bad_buffer_; vec.iov_len = kPageSize; + off_t orig_size = Size(); + int n; - EXPECT_THAT(RetryEINTR(pwritev)(fd_, &vec, 1, 0), - SyscallFailsWithErrno(EFAULT)); + EXPECT_THAT( + (n = RetryEINTR(pwritev)(fd_, &vec, 1, orig_size)), + AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1))); + EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0)); } TEST_F(PartialBadBufferTest, PwritevSmall) { - // FIXME(b/24788078): The sentry write syscalls will return immediately - // if Access returns an error, but Access may not return an error - // and the sentry will instead perform a partial write. - SKIP_IF(IsRunningOnGvisor()); - struct iovec vec; vec.iov_base = bad_buffer_; vec.iov_len = 10; + off_t orig_size = Size(); + int n; - EXPECT_THAT(RetryEINTR(pwritev)(fd_, &vec, 1, 0), - SyscallFailsWithErrno(EFAULT)); + EXPECT_THAT( + (n = RetryEINTR(pwritev)(fd_, &vec, 1, orig_size)), + AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1))); + EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0)); } // getdents returns EFAULT when the you claim the buffer is large enough, but @@ -283,29 +296,6 @@ TEST_F(PartialBadBufferTest, GetdentsOneEntry) { SyscallSucceedsWithValue(Gt(0))); } -// Verify that when write returns EFAULT the kernel hasn't silently written -// the initial valid bytes. -TEST_F(PartialBadBufferTest, WriteEfaultIsntPartial) { - // FIXME(b/24788078): The sentry write syscalls will return immediately - // if Access returns an error, but Access may not return an error - // and the sentry will instead perform a partial write. - SKIP_IF(IsRunningOnGvisor()); - - bad_buffer_[0] = 'A'; - EXPECT_THAT(RetryEINTR(write)(fd_, bad_buffer_, 10), - SyscallFailsWithErrno(EFAULT)); - - size_t size = 255; - char buf[255]; - memset(buf, 0, size); - - EXPECT_THAT(RetryEINTR(pread)(fd_, buf, size, 0), - SyscallSucceedsWithValue(sizeof(kMessage) - 1)); - - // 'A' has not been written. - EXPECT_STREQ(buf, kMessage); -} - PosixErrorOr<sockaddr_storage> InetLoopbackAddr(int family) { struct sockaddr_storage addr; memset(&addr, 0, sizeof(addr)); diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc index c0b354e65..ac9b21b24 100644 --- a/test/syscalls/linux/pipe.cc +++ b/test/syscalls/linux/pipe.cc @@ -212,6 +212,20 @@ TEST(Pipe2Test, BadOptions) { EXPECT_THAT(pipe2(fds, 0xDEAD), SyscallFailsWithErrno(EINVAL)); } +// Tests that opening named pipes with O_TRUNC shouldn't cause an error, but +// calls to (f)truncate should. +TEST(NamedPipeTest, Truncate) { + const std::string tmp_path = NewTempAbsPath(); + SKIP_IF(mkfifo(tmp_path.c_str(), 0644) != 0); + + ASSERT_THAT(open(tmp_path.c_str(), O_NONBLOCK | O_RDONLY), SyscallSucceeds()); + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE( + Open(tmp_path.c_str(), O_RDWR | O_NONBLOCK | O_TRUNC)); + + ASSERT_THAT(truncate(tmp_path.c_str(), 0), SyscallFailsWithErrno(EINVAL)); + ASSERT_THAT(ftruncate(fd.get(), 0), SyscallFailsWithErrno(EINVAL)); +} + TEST_P(PipeTest, Seek) { SKIP_IF(!CreateBlocking()); diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index 512de5ee0..8cf08991b 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -37,6 +37,7 @@ #include <map> #include <memory> #include <ostream> +#include <regex> #include <string> #include <unordered_set> #include <utility> @@ -51,6 +52,7 @@ #include "absl/strings/str_split.h" #include "absl/strings/string_view.h" #include "absl/synchronization/mutex.h" +#include "absl/synchronization/notification.h" #include "absl/time/clock.h" #include "absl/time/time.h" #include "test/util/capability_util.h" @@ -1988,6 +1990,44 @@ TEST(Proc, GetdentsEnoent) { SyscallFailsWithErrno(ENOENT)); } +void CheckSyscwFromIOFile(const std::string& path, const std::string& regex) { + std::string output; + ASSERT_NO_ERRNO(GetContents(path, &output)); + ASSERT_THAT(output, ContainsRegex(absl::StrCat("syscw:\\s+", regex, "\n"))); +} + +// Checks that there is variable accounting of IO between threads/tasks. +TEST(Proc, PidTidIOAccounting) { + absl::Notification notification; + + // Run a thread with a bunch of writes. Check that io account records exactly + // the number of write calls. File open/close is there to prevent buffering. + ScopedThread writer([¬ification] { + const int num_writes = 100; + for (int i = 0; i < num_writes; i++) { + auto path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + ASSERT_NO_ERRNO(SetContents(path.path(), "a")); + } + notification.Notify(); + const std::string& writer_dir = + absl::StrCat("/proc/", getpid(), "/task/", gettid(), "/io"); + + CheckSyscwFromIOFile(writer_dir, std::to_string(num_writes)); + }); + + // Run a thread and do no writes. Check that no writes are recorded. + ScopedThread noop([¬ification] { + notification.WaitForNotification(); + const std::string& noop_dir = + absl::StrCat("/proc/", getpid(), "/task/", gettid(), "/io"); + + CheckSyscwFromIOFile(noop_dir, "0"); + }); + + writer.Join(); + noop.Join(); +} + } // namespace } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/proc_net_tcp.cc b/test/syscalls/linux/proc_net_tcp.cc index 2659f6a98..5b6e3e3cd 100644 --- a/test/syscalls/linux/proc_net_tcp.cc +++ b/test/syscalls/linux/proc_net_tcp.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include <netinet/tcp.h> #include <sys/socket.h> #include <sys/stat.h> #include <sys/types.h> diff --git a/test/syscalls/linux/proc_net_udp.cc b/test/syscalls/linux/proc_net_udp.cc index f06f1a24b..786b4b4af 100644 --- a/test/syscalls/linux/proc_net_udp.cc +++ b/test/syscalls/linux/proc_net_udp.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include <netinet/tcp.h> #include <sys/socket.h> #include <sys/stat.h> #include <sys/types.h> diff --git a/test/syscalls/linux/readv_common.cc b/test/syscalls/linux/readv_common.cc index 9658f7d42..491d5f40f 100644 --- a/test/syscalls/linux/readv_common.cc +++ b/test/syscalls/linux/readv_common.cc @@ -19,12 +19,53 @@ #include <unistd.h> #include "gtest/gtest.h" -#include "test/syscalls/linux/file_base.h" #include "test/util/test_util.h" namespace gvisor { namespace testing { +// MatchesStringLength checks that a tuple argument of (struct iovec *, int) +// corresponding to an iovec array and its length, contains data that matches +// the string length strlen. +MATCHER_P(MatchesStringLength, strlen, "") { + struct iovec* iovs = arg.first; + int niov = arg.second; + int offset = 0; + for (int i = 0; i < niov; i++) { + offset += iovs[i].iov_len; + } + if (offset != static_cast<int>(strlen)) { + *result_listener << offset; + return false; + } + return true; +} + +// MatchesStringValue checks that a tuple argument of (struct iovec *, int) +// corresponding to an iovec array and its length, contains data that matches +// the string value str. +MATCHER_P(MatchesStringValue, str, "") { + struct iovec* iovs = arg.first; + int len = strlen(str); + int niov = arg.second; + int offset = 0; + for (int i = 0; i < niov; i++) { + struct iovec iov = iovs[i]; + if (len < offset) { + *result_listener << "strlen " << len << " < offset " << offset; + return false; + } + if (strncmp(static_cast<char*>(iov.iov_base), &str[offset], iov.iov_len)) { + absl::string_view iovec_string(static_cast<char*>(iov.iov_base), + iov.iov_len); + *result_listener << iovec_string << " @offset " << offset; + return false; + } + offset += iov.iov_len; + } + return true; +} + extern const char kReadvTestData[] = "127.0.0.1 localhost" "" diff --git a/test/syscalls/linux/readv_socket.cc b/test/syscalls/linux/readv_socket.cc index 9b6972201..dd6fb7008 100644 --- a/test/syscalls/linux/readv_socket.cc +++ b/test/syscalls/linux/readv_socket.cc @@ -19,7 +19,6 @@ #include <unistd.h> #include "gtest/gtest.h" -#include "test/syscalls/linux/file_base.h" #include "test/syscalls/linux/readv_common.h" #include "test/util/test_util.h" @@ -28,9 +27,30 @@ namespace testing { namespace { -class ReadvSocketTest : public SocketTest { +class ReadvSocketTest : public ::testing::Test { + public: void SetUp() override { - SocketTest::SetUp(); + test_unix_stream_socket_[0] = -1; + test_unix_stream_socket_[1] = -1; + test_unix_dgram_socket_[0] = -1; + test_unix_dgram_socket_[1] = -1; + test_unix_seqpacket_socket_[0] = -1; + test_unix_seqpacket_socket_[1] = -1; + + ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, 0, test_unix_stream_socket_), + SyscallSucceeds()); + ASSERT_THAT(fcntl(test_unix_stream_socket_[0], F_SETFL, O_NONBLOCK), + SyscallSucceeds()); + ASSERT_THAT(socketpair(AF_UNIX, SOCK_DGRAM, 0, test_unix_dgram_socket_), + SyscallSucceeds()); + ASSERT_THAT(fcntl(test_unix_dgram_socket_[0], F_SETFL, O_NONBLOCK), + SyscallSucceeds()); + ASSERT_THAT( + socketpair(AF_UNIX, SOCK_SEQPACKET, 0, test_unix_seqpacket_socket_), + SyscallSucceeds()); + ASSERT_THAT(fcntl(test_unix_seqpacket_socket_[0], F_SETFL, O_NONBLOCK), + SyscallSucceeds()); + ASSERT_THAT( write(test_unix_stream_socket_[1], kReadvTestData, kReadvTestDataSize), SyscallSucceedsWithValue(kReadvTestDataSize)); @@ -40,11 +60,22 @@ class ReadvSocketTest : public SocketTest { ASSERT_THAT(write(test_unix_seqpacket_socket_[1], kReadvTestData, kReadvTestDataSize), SyscallSucceedsWithValue(kReadvTestDataSize)); - // FIXME(b/69821513): Enable when possible. - // ASSERT_THAT(write(test_tcp_socket_[1], kReadvTestData, - // kReadvTestDataSize), - // SyscallSucceedsWithValue(kReadvTestDataSize)); } + + void TearDown() override { + close(test_unix_stream_socket_[0]); + close(test_unix_stream_socket_[1]); + + close(test_unix_dgram_socket_[0]); + close(test_unix_dgram_socket_[1]); + + close(test_unix_seqpacket_socket_[0]); + close(test_unix_seqpacket_socket_[1]); + } + + int test_unix_stream_socket_[2]; + int test_unix_dgram_socket_[2]; + int test_unix_seqpacket_socket_[2]; }; TEST_F(ReadvSocketTest, ReadOneBufferPerByte_StreamSocket) { diff --git a/test/syscalls/linux/rseq.cc b/test/syscalls/linux/rseq.cc new file mode 100644 index 000000000..106c045e3 --- /dev/null +++ b/test/syscalls/linux/rseq.cc @@ -0,0 +1,198 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <errno.h> +#include <signal.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> + +#include "gtest/gtest.h" +#include "test/syscalls/linux/rseq/test.h" +#include "test/syscalls/linux/rseq/uapi.h" +#include "test/util/logging.h" +#include "test/util/multiprocess_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// Syscall test for rseq (restartable sequences). +// +// We must be very careful about how these tests are written. Each thread may +// only have one struct rseq registration, which may be done automatically at +// thread start (as of 2019-11-13, glibc does *not* support rseq and thus does +// not do so). +// +// Testing of rseq is thus done primarily in a child process with no +// registration. This means exec'ing a nostdlib binary, as rseq registration can +// only be cleared by execve (or knowing the old rseq address), and glibc (based +// on the current unmerged patches) register rseq before calling main()). + +int RSeq(struct rseq* rseq, uint32_t rseq_len, int flags, uint32_t sig) { + return syscall(kRseqSyscall, rseq, rseq_len, flags, sig); +} + +// Returns true if this kernel supports the rseq syscall. +PosixErrorOr<bool> RSeqSupported() { + // We have to be careful here, there are three possible cases: + // + // 1. rseq is not supported -> ENOSYS + // 2. rseq is supported and not registered -> success, but we should + // unregister. + // 3. rseq is supported and registered -> EINVAL (most likely). + + // The only validation done on new registrations is that rseq is aligned and + // writable. + rseq rseq = {}; + int ret = RSeq(&rseq, sizeof(rseq), 0, 0); + if (ret == 0) { + // Successfully registered, rseq is supported. Unregister. + ret = RSeq(&rseq, sizeof(rseq), kRseqFlagUnregister, 0); + if (ret != 0) { + return PosixError(errno); + } + return true; + } + + switch (errno) { + case ENOSYS: + // Not supported. + return false; + case EINVAL: + // Supported, but already registered. EINVAL returned because we provided + // a different address. + return true; + default: + // Unknown error. + return PosixError(errno); + } +} + +constexpr char kRseqBinary[] = "test/syscalls/linux/rseq/rseq"; + +void RunChildTest(std::string test_case, int want_status) { + std::string path = RunfilePath(kRseqBinary); + + pid_t child_pid = -1; + int execve_errno = 0; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(path, {path, test_case}, {}, &child_pid, &execve_errno)); + + ASSERT_GT(child_pid, 0); + ASSERT_EQ(execve_errno, 0); + + int status = 0; + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds()); + ASSERT_EQ(status, want_status); +} + +// Test that rseq must be aligned. +TEST(RseqTest, Unaligned) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported())); + + RunChildTest(kRseqTestUnaligned, 0); +} + +// Sanity test that registration works. +TEST(RseqTest, Register) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported())); + + RunChildTest(kRseqTestRegister, 0); +} + +// Registration can't be done twice. +TEST(RseqTest, DoubleRegister) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported())); + + RunChildTest(kRseqTestDoubleRegister, 0); +} + +// Registration can be done again after unregister. +TEST(RseqTest, RegisterUnregister) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported())); + + RunChildTest(kRseqTestRegisterUnregister, 0); +} + +// The pointer to rseq must match on register/unregister. +TEST(RseqTest, UnregisterDifferentPtr) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported())); + + RunChildTest(kRseqTestUnregisterDifferentPtr, 0); +} + +// The signature must match on register/unregister. +TEST(RseqTest, UnregisterDifferentSignature) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported())); + + RunChildTest(kRseqTestUnregisterDifferentSignature, 0); +} + +// The CPU ID is initialized. +TEST(RseqTest, CPU) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported())); + + RunChildTest(kRseqTestCPU, 0); +} + +// Critical section is eventually aborted. +TEST(RseqTest, Abort) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported())); + + RunChildTest(kRseqTestAbort, 0); +} + +// Abort may be before the critical section. +TEST(RseqTest, AbortBefore) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported())); + + RunChildTest(kRseqTestAbortBefore, 0); +} + +// Signature must match. +TEST(RseqTest, AbortSignature) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported())); + + RunChildTest(kRseqTestAbortSignature, SIGSEGV); +} + +// Abort must not be in the critical section. +TEST(RseqTest, AbortPreCommit) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported())); + + RunChildTest(kRseqTestAbortPreCommit, SIGSEGV); +} + +// rseq.rseq_cs is cleared on abort. +TEST(RseqTest, AbortClearsCS) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported())); + + RunChildTest(kRseqTestAbortClearsCS, 0); +} + +// rseq.rseq_cs is cleared on abort outside of critical section. +TEST(RseqTest, InvalidAbortClearsCS) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported())); + + RunChildTest(kRseqTestInvalidAbortClearsCS, 0); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/rseq/BUILD b/test/syscalls/linux/rseq/BUILD new file mode 100644 index 000000000..5cfe4e56f --- /dev/null +++ b/test/syscalls/linux/rseq/BUILD @@ -0,0 +1,59 @@ +# This package contains a standalone rseq test binary. This binary must not +# depend on libc, which might use rseq itself. + +load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", "cc_flags_supplier") +load("@rules_cc//cc:defs.bzl", "cc_library") + +package(licenses = ["notice"]) + +genrule( + name = "rseq_binary", + srcs = [ + "critical.h", + "critical.S", + "rseq.cc", + "syscalls.h", + "start.S", + "test.h", + "types.h", + "uapi.h", + ], + outs = ["rseq"], + cmd = " ".join([ + "$(CC)", + "$(CC_FLAGS) ", + "-I.", + "-Wall", + "-Werror", + "-O2", + "-std=c++17", + "-static", + "-nostdlib", + "-ffreestanding", + "-o", + "$(location rseq)", + "$(location critical.S)", + "$(location rseq.cc)", + "$(location start.S)", + ]), + toolchains = [ + ":no_pie_cc_flags", + "@bazel_tools//tools/cpp:current_cc_toolchain", + ], + visibility = ["//:sandbox"], +) + +cc_flags_supplier( + name = "no_pie_cc_flags", + features = ["-pie"], +) + +cc_library( + name = "lib", + testonly = 1, + hdrs = [ + "test.h", + "uapi.h", + ], + visibility = ["//:sandbox"], +) diff --git a/test/syscalls/linux/rseq/critical.S b/test/syscalls/linux/rseq/critical.S new file mode 100644 index 000000000..8c0687e6d --- /dev/null +++ b/test/syscalls/linux/rseq/critical.S @@ -0,0 +1,66 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Restartable sequences critical sections. + +// Loops continuously until aborted. +// +// void rseq_loop(struct rseq* r, struct rseq_cs* cs) + + .text + .globl rseq_loop + .type rseq_loop, @function + +rseq_loop: + jmp begin + + // Abort block before the critical section. + // Abort signature is 4 nops for simplicity. + .byte 0x90, 0x90, 0x90, 0x90 + .globl rseq_loop_early_abort +rseq_loop_early_abort: + ret + +begin: + // r->rseq_cs = cs + movq %rsi, 8(%rdi) + + // N.B. rseq_cs will be cleared by any preempt, even outside the critical + // section. Thus it must be set in or immediately before the critical section + // to ensure it is not cleared before the section begins. + .globl rseq_loop_start +rseq_loop_start: + jmp rseq_loop_start + + // "Pre-commit": extra instructions inside the critical section. These are + // used as the abort point in TestAbortPreCommit, which is not valid. + .globl rseq_loop_pre_commit +rseq_loop_pre_commit: + // Extra abort signature + nop for TestAbortPostCommit. + .byte 0x90, 0x90, 0x90, 0x90 + nop + + // "Post-commit": never reached in this case. + .globl rseq_loop_post_commit +rseq_loop_post_commit: + + // Abort signature is 4 nops for simplicity. + .byte 0x90, 0x90, 0x90, 0x90 + + .globl rseq_loop_abort +rseq_loop_abort: + ret + + .size rseq_loop,.-rseq_loop + .section .note.GNU-stack,"",@progbits diff --git a/test/syscalls/linux/rseq/critical.h b/test/syscalls/linux/rseq/critical.h new file mode 100644 index 000000000..ac987a25e --- /dev/null +++ b/test/syscalls/linux/rseq/critical.h @@ -0,0 +1,39 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_CRITICAL_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_CRITICAL_H_ + +#include "test/syscalls/linux/rseq/types.h" +#include "test/syscalls/linux/rseq/uapi.h" + +constexpr uint32_t kRseqSignature = 0x90909090; + +extern "C" { + +extern void rseq_loop(struct rseq* r, struct rseq_cs* cs); +extern void* rseq_loop_early_abort; +extern void* rseq_loop_start; +extern void* rseq_loop_pre_commit; +extern void* rseq_loop_post_commit; +extern void* rseq_loop_abort; + +extern int rseq_getpid(struct rseq* r, struct rseq_cs* cs); +extern void* rseq_getpid_start; +extern void* rseq_getpid_post_commit; +extern void* rseq_getpid_abort; + +} // extern "C" + +#endif // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_CRITICAL_H_ diff --git a/test/syscalls/linux/rseq/rseq.cc b/test/syscalls/linux/rseq/rseq.cc new file mode 100644 index 000000000..f036db26d --- /dev/null +++ b/test/syscalls/linux/rseq/rseq.cc @@ -0,0 +1,366 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/syscalls/linux/rseq/critical.h" +#include "test/syscalls/linux/rseq/syscalls.h" +#include "test/syscalls/linux/rseq/test.h" +#include "test/syscalls/linux/rseq/types.h" +#include "test/syscalls/linux/rseq/uapi.h" + +namespace gvisor { +namespace testing { + +extern "C" int main(int argc, char** argv, char** envp); + +// Standalone initialization before calling main(). +extern "C" void __init(uintptr_t* sp) { + int argc = sp[0]; + char** argv = reinterpret_cast<char**>(&sp[1]); + char** envp = &argv[argc + 1]; + + // Call main() and exit. + sys_exit_group(main(argc, argv, envp)); + + // sys_exit_group does not return +} + +int strcmp(const char* s1, const char* s2) { + const unsigned char* p1 = reinterpret_cast<const unsigned char*>(s1); + const unsigned char* p2 = reinterpret_cast<const unsigned char*>(s2); + + while (*p1 == *p2) { + if (!*p1) { + return 0; + } + ++p1; + ++p2; + } + return static_cast<int>(*p1) - static_cast<int>(*p2); +} + +int sys_rseq(struct rseq* rseq, uint32_t rseq_len, int flags, uint32_t sig) { + return raw_syscall(kRseqSyscall, rseq, rseq_len, flags, sig); +} + +// Test that rseq must be aligned. +int TestUnaligned() { + constexpr uintptr_t kRequiredAlignment = alignof(rseq); + + char buf[2 * kRequiredAlignment] = {}; + uintptr_t ptr = reinterpret_cast<uintptr_t>(&buf[0]); + if ((ptr & (kRequiredAlignment - 1)) == 0) { + // buf is already aligned. Misalign it. + ptr++; + } + + int ret = sys_rseq(reinterpret_cast<rseq*>(ptr), sizeof(rseq), 0, 0); + if (sys_errno(ret) != EINVAL) { + return 1; + } + return 0; +} + +// Sanity test that registration works. +int TestRegister() { + struct rseq r = {}; + if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) { + return 1; + } + return 0; +}; + +// Registration can't be done twice. +int TestDoubleRegister() { + struct rseq r = {}; + if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) { + return 1; + } + + if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != EBUSY) { + return 1; + } + + return 0; +}; + +// Registration can be done again after unregister. +int TestRegisterUnregister() { + struct rseq r = {}; + if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) { + return 1; + } + + if (int ret = sys_rseq(&r, sizeof(r), kRseqFlagUnregister, 0); + sys_errno(ret) != 0) { + return 1; + } + + if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) { + return 1; + } + + return 0; +}; + +// The pointer to rseq must match on register/unregister. +int TestUnregisterDifferentPtr() { + struct rseq r = {}; + if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) { + return 1; + } + + struct rseq r2 = {}; + if (int ret = sys_rseq(&r2, sizeof(r2), kRseqFlagUnregister, 0); + sys_errno(ret) != EINVAL) { + return 1; + } + + return 0; +}; + +// The signature must match on register/unregister. +int TestUnregisterDifferentSignature() { + constexpr int kSignature = 0; + + struct rseq r = {}; + if (int ret = sys_rseq(&r, sizeof(r), 0, kSignature); sys_errno(ret) != 0) { + return 1; + } + + if (int ret = sys_rseq(&r, sizeof(r), kRseqFlagUnregister, kSignature + 1); + sys_errno(ret) != EPERM) { + return 1; + } + + return 0; +}; + +// The CPU ID is initialized. +int TestCPU() { + struct rseq r = {}; + r.cpu_id = kRseqCPUIDUninitialized; + + if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) { + return 1; + } + + if (__atomic_load_n(&r.cpu_id, __ATOMIC_RELAXED) < 0) { + return 1; + } + if (__atomic_load_n(&r.cpu_id_start, __ATOMIC_RELAXED) < 0) { + return 1; + } + + return 0; +}; + +// Critical section is eventually aborted. +int TestAbort() { + struct rseq r = {}; + if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature); + sys_errno(ret) != 0) { + return 1; + } + + struct rseq_cs cs = {}; + cs.version = 0; + cs.flags = 0; + cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start); + cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) - + reinterpret_cast<uint64_t>(&rseq_loop_start); + cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort); + + // Loops until abort. If this returns then abort occurred. + rseq_loop(&r, &cs); + + return 0; +}; + +// Abort may be before the critical section. +int TestAbortBefore() { + struct rseq r = {}; + if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature); + sys_errno(ret) != 0) { + return 1; + } + + struct rseq_cs cs = {}; + cs.version = 0; + cs.flags = 0; + cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start); + cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) - + reinterpret_cast<uint64_t>(&rseq_loop_start); + cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_early_abort); + + // Loops until abort. If this returns then abort occurred. + rseq_loop(&r, &cs); + + return 0; +}; + +// Signature must match. +int TestAbortSignature() { + struct rseq r = {}; + if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature + 1); + sys_errno(ret) != 0) { + return 1; + } + + struct rseq_cs cs = {}; + cs.version = 0; + cs.flags = 0; + cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start); + cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) - + reinterpret_cast<uint64_t>(&rseq_loop_start); + cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort); + + // Loops until abort. This should SIGSEGV on abort. + rseq_loop(&r, &cs); + + return 1; +}; + +// Abort must not be in the critical section. +int TestAbortPreCommit() { + struct rseq r = {}; + if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature + 1); + sys_errno(ret) != 0) { + return 1; + } + + struct rseq_cs cs = {}; + cs.version = 0; + cs.flags = 0; + cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start); + cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) - + reinterpret_cast<uint64_t>(&rseq_loop_start); + cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_pre_commit); + + // Loops until abort. This should SIGSEGV on abort. + rseq_loop(&r, &cs); + + return 1; +}; + +// rseq.rseq_cs is cleared on abort. +int TestAbortClearsCS() { + struct rseq r = {}; + if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature); + sys_errno(ret) != 0) { + return 1; + } + + struct rseq_cs cs = {}; + cs.version = 0; + cs.flags = 0; + cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start); + cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) - + reinterpret_cast<uint64_t>(&rseq_loop_start); + cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort); + + // Loops until abort. If this returns then abort occurred. + rseq_loop(&r, &cs); + + if (__atomic_load_n(&r.rseq_cs, __ATOMIC_RELAXED)) { + return 1; + } + + return 0; +}; + +// rseq.rseq_cs is cleared on abort outside of critical section. +int TestInvalidAbortClearsCS() { + struct rseq r = {}; + if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature); + sys_errno(ret) != 0) { + return 1; + } + + struct rseq_cs cs = {}; + cs.version = 0; + cs.flags = 0; + cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start); + cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) - + reinterpret_cast<uint64_t>(&rseq_loop_start); + cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort); + + __atomic_store_n(&r.rseq_cs, &cs, __ATOMIC_RELAXED); + + // When the next abort condition occurs, the kernel will clear cs once it + // determines we aren't in the critical section. + while (1) { + if (!__atomic_load_n(&r.rseq_cs, __ATOMIC_RELAXED)) { + break; + } + } + + return 0; +}; + +// Exit codes: +// 0 - Pass +// 1 - Fail +// 2 - Missing argument +// 3 - Unknown test case +extern "C" int main(int argc, char** argv, char** envp) { + if (argc != 2) { + // Usage: rseq <test case> + return 2; + } + + if (strcmp(argv[1], kRseqTestUnaligned) == 0) { + return TestUnaligned(); + } + if (strcmp(argv[1], kRseqTestRegister) == 0) { + return TestRegister(); + } + if (strcmp(argv[1], kRseqTestDoubleRegister) == 0) { + return TestDoubleRegister(); + } + if (strcmp(argv[1], kRseqTestRegisterUnregister) == 0) { + return TestRegisterUnregister(); + } + if (strcmp(argv[1], kRseqTestUnregisterDifferentPtr) == 0) { + return TestUnregisterDifferentPtr(); + } + if (strcmp(argv[1], kRseqTestUnregisterDifferentSignature) == 0) { + return TestUnregisterDifferentSignature(); + } + if (strcmp(argv[1], kRseqTestCPU) == 0) { + return TestCPU(); + } + if (strcmp(argv[1], kRseqTestAbort) == 0) { + return TestAbort(); + } + if (strcmp(argv[1], kRseqTestAbortBefore) == 0) { + return TestAbortBefore(); + } + if (strcmp(argv[1], kRseqTestAbortSignature) == 0) { + return TestAbortSignature(); + } + if (strcmp(argv[1], kRseqTestAbortPreCommit) == 0) { + return TestAbortPreCommit(); + } + if (strcmp(argv[1], kRseqTestAbortClearsCS) == 0) { + return TestAbortClearsCS(); + } + if (strcmp(argv[1], kRseqTestInvalidAbortClearsCS) == 0) { + return TestInvalidAbortClearsCS(); + } + + return 3; +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/rseq/start.S b/test/syscalls/linux/rseq/start.S new file mode 100644 index 000000000..b9611b276 --- /dev/null +++ b/test/syscalls/linux/rseq/start.S @@ -0,0 +1,45 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + + .text + .align 4 + .type _start,@function + .globl _start + +_start: + movq %rsp,%rdi + call __init + hlt + + .size _start,.-_start + .section .note.GNU-stack,"",@progbits + + .text + .globl raw_syscall + .type raw_syscall, @function + +raw_syscall: + mov %rdi,%rax // syscall # + mov %rsi,%rdi // arg0 + mov %rdx,%rsi // arg1 + mov %rcx,%rdx // arg2 + mov %r8,%r10 // arg3 (goes in r10 instead of rcx for system calls) + mov %r9,%r8 // arg4 + mov 0x8(%rsp),%r9 // arg5 + syscall + ret + + .size raw_syscall,.-raw_syscall + .section .note.GNU-stack,"",@progbits diff --git a/test/syscalls/linux/rseq/syscalls.h b/test/syscalls/linux/rseq/syscalls.h new file mode 100644 index 000000000..e5299c188 --- /dev/null +++ b/test/syscalls/linux/rseq/syscalls.h @@ -0,0 +1,66 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_SYSCALLS_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_SYSCALLS_H_ + +#include "test/syscalls/linux/rseq/types.h" + +#ifdef __x86_64__ +// Syscall numbers. +constexpr int kGetpid = 39; +constexpr int kExitGroup = 231; +#else +#error "Unknown architecture" +#endif + +namespace gvisor { +namespace testing { + +// Standalone system call interfaces. +// Note that these are all "raw" system call interfaces which encode +// errors by setting the return value to a small negative number. +// Use sys_errno() to check system call return values for errors. + +// Maximum Linux error number. +constexpr int kMaxErrno = 4095; + +// Errno values. +#define EPERM 1 +#define EFAULT 14 +#define EBUSY 16 +#define EINVAL 22 + +// Get the error number from a raw system call return value. +// Returns a positive error number or 0 if there was no error. +static inline int sys_errno(uintptr_t rval) { + if (rval >= static_cast<uintptr_t>(-kMaxErrno)) { + return -static_cast<int>(rval); + } + return 0; +} + +extern "C" uintptr_t raw_syscall(int number, ...); + +static inline void sys_exit_group(int status) { + raw_syscall(kExitGroup, status); +} +static inline int sys_getpid() { + return static_cast<int>(raw_syscall(kGetpid)); +} + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_SYSCALLS_H_ diff --git a/test/syscalls/linux/rseq/test.h b/test/syscalls/linux/rseq/test.h new file mode 100644 index 000000000..3b7bb74b1 --- /dev/null +++ b/test/syscalls/linux/rseq/test.h @@ -0,0 +1,43 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TEST_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TEST_H_ + +namespace gvisor { +namespace testing { + +// Test cases supported by rseq binary. + +inline constexpr char kRseqTestUnaligned[] = "unaligned"; +inline constexpr char kRseqTestRegister[] = "register"; +inline constexpr char kRseqTestDoubleRegister[] = "double-register"; +inline constexpr char kRseqTestRegisterUnregister[] = "register-unregister"; +inline constexpr char kRseqTestUnregisterDifferentPtr[] = + "unregister-different-ptr"; +inline constexpr char kRseqTestUnregisterDifferentSignature[] = + "unregister-different-signature"; +inline constexpr char kRseqTestCPU[] = "cpu"; +inline constexpr char kRseqTestAbort[] = "abort"; +inline constexpr char kRseqTestAbortBefore[] = "abort-before"; +inline constexpr char kRseqTestAbortSignature[] = "abort-signature"; +inline constexpr char kRseqTestAbortPreCommit[] = "abort-precommit"; +inline constexpr char kRseqTestAbortClearsCS[] = "abort-clears-cs"; +inline constexpr char kRseqTestInvalidAbortClearsCS[] = + "invalid-abort-clears-cs"; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TEST_H_ diff --git a/test/syscalls/linux/rseq/types.h b/test/syscalls/linux/rseq/types.h new file mode 100644 index 000000000..b6afe9817 --- /dev/null +++ b/test/syscalls/linux/rseq/types.h @@ -0,0 +1,31 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TYPES_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TYPES_H_ + +using size_t = __SIZE_TYPE__; +using uintptr_t = __UINTPTR_TYPE__; + +using uint8_t = __UINT8_TYPE__; +using uint16_t = __UINT16_TYPE__; +using uint32_t = __UINT32_TYPE__; +using uint64_t = __UINT64_TYPE__; + +using int8_t = __INT8_TYPE__; +using int16_t = __INT16_TYPE__; +using int32_t = __INT32_TYPE__; +using int64_t = __INT64_TYPE__; + +#endif // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TYPES_H_ diff --git a/test/syscalls/linux/rseq/uapi.h b/test/syscalls/linux/rseq/uapi.h new file mode 100644 index 000000000..e3ff0579a --- /dev/null +++ b/test/syscalls/linux/rseq/uapi.h @@ -0,0 +1,54 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_ + +// User-kernel ABI for restartable sequences. + +// Standard types. +// +// N.B. This header will be included in targets that do have the standard +// library, so we can't shadow the standard type names. +using __u32 = __UINT32_TYPE__; +using __u64 = __UINT64_TYPE__; + +#ifdef __x86_64__ +// Syscall numbers. +constexpr int kRseqSyscall = 334; +#else +#error "Unknown architecture" +#endif // __x86_64__ + +struct rseq_cs { + __u32 version; + __u32 flags; + __u64 start_ip; + __u64 post_commit_offset; + __u64 abort_ip; +} __attribute__((aligned(4 * sizeof(__u64)))); + +// N.B. alignment is enforced by the kernel. +struct rseq { + __u32 cpu_id_start; + __u32 cpu_id; + struct rseq_cs* rseq_cs; + __u32 flags; +} __attribute__((aligned(4 * sizeof(__u64)))); + +constexpr int kRseqFlagUnregister = 1 << 0; + +constexpr int kRseqCPUIDUninitialized = -1; + +#endif // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_ diff --git a/test/syscalls/linux/sigaltstack.cc b/test/syscalls/linux/sigaltstack.cc index 6fd3989a4..a778fa639 100644 --- a/test/syscalls/linux/sigaltstack.cc +++ b/test/syscalls/linux/sigaltstack.cc @@ -95,13 +95,7 @@ TEST(SigaltstackTest, ResetByExecve) { auto const cleanup_sigstack = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaltstack(stack)); - std::string full_path; - char* test_src = getenv("TEST_SRCDIR"); - if (test_src) { - full_path = JoinPath(test_src, "../../linux/sigaltstack_check"); - } - - ASSERT_FALSE(full_path.empty()); + std::string full_path = RunfilePath("test/syscalls/linux/sigaltstack_check"); pid_t child_pid = -1; int execve_errno = 0; diff --git a/test/syscalls/linux/socket_bind_to_device_sequence.cc b/test/syscalls/linux/socket_bind_to_device_sequence.cc index e4641c62e..34b1058a9 100644 --- a/test/syscalls/linux/socket_bind_to_device_sequence.cc +++ b/test/syscalls/linux/socket_bind_to_device_sequence.cc @@ -97,12 +97,22 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> { sockets_to_close_.erase(socket_id); } - // Bind a socket with the reuse option and bind_to_device options. Checks + // SetDevice changes the bind_to_device option. It does not bind or re-bind. + void SetDevice(int socket_id, int device_id) { + auto socket_fd = sockets_to_close_[socket_id]->get(); + string device_name; + ASSERT_NO_FATAL_FAILURE(GetDevice(device_id, &device_name)); + EXPECT_THAT(setsockopt(socket_fd, SOL_SOCKET, SO_BINDTODEVICE, + device_name.c_str(), device_name.size() + 1), + SyscallSucceedsWithValue(0)); + } + + // Bind a socket with the reuse options and bind_to_device options. Checks // that all steps succeed and that the bind command's error matches want. // Sets the socket_id to uniquely identify the socket bound if it is not // nullptr. - void BindSocket(bool reuse, int device_id = 0, int want = 0, - int *socket_id = nullptr) { + void BindSocket(bool reuse_port, bool reuse_addr, int device_id = 0, + int want = 0, int *socket_id = nullptr) { next_socket_id_++; sockets_to_close_[next_socket_id_] = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); auto socket_fd = sockets_to_close_[next_socket_id_]->get(); @@ -110,13 +120,20 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> { *socket_id = next_socket_id_; } - // If reuse is indicated, do that. - if (reuse) { + // If reuse_port is indicated, do that. + if (reuse_port) { EXPECT_THAT(setsockopt(socket_fd, SOL_SOCKET, SO_REUSEPORT, &kSockOptOn, sizeof(kSockOptOn)), SyscallSucceedsWithValue(0)); } + // If reuse_addr is indicated, do that. + if (reuse_addr) { + EXPECT_THAT(setsockopt(socket_fd, SOL_SOCKET, SO_REUSEADDR, &kSockOptOn, + sizeof(kSockOptOn)), + SyscallSucceedsWithValue(0)); + } + // If the device is non-zero, bind to that device. if (device_id != 0) { string device_name; @@ -182,129 +199,308 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> { }; TEST_P(BindToDeviceSequenceTest, BindTwiceWithDeviceFails) { - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 3)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 3, EADDRINUSE)); + ASSERT_NO_FATAL_FAILURE(BindSocket( + /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 3)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 3, EADDRINUSE)); } TEST_P(BindToDeviceSequenceTest, BindToDevice) { - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 1)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 2)); + ASSERT_NO_FATAL_FAILURE(BindSocket( + /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 1)); + ASSERT_NO_FATAL_FAILURE(BindSocket( + /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 2)); } TEST_P(BindToDeviceSequenceTest, BindToDeviceAndThenWithoutDevice) { - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 123)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 123)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 0, EADDRINUSE)); } TEST_P(BindToDeviceSequenceTest, BindWithoutDevice) { - ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ false)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ true, /* bind_to_device */ 123, EADDRINUSE)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 123, EADDRINUSE)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true, + /* reuse_addr */ false, + /* bind_to_device */ 123, EADDRINUSE)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 0, EADDRINUSE)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true, + /* reuse_addr */ false, + /* bind_to_device */ 0, EADDRINUSE)); } TEST_P(BindToDeviceSequenceTest, BindWithDevice) { - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 123, 0)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ true, /* bind_to_device */ 123, EADDRINUSE)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ true, /* bind_to_device */ 456, 0)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 789, 0)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 123, 0)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 123, EADDRINUSE)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true, + /* reuse_addr */ false, + /* bind_to_device */ 123, EADDRINUSE)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 0, EADDRINUSE)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true, + /* reuse_addr */ false, + /* bind_to_device */ 0, EADDRINUSE)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true, + /* reuse_addr */ false, + /* bind_to_device */ 456, 0)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 789, 0)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 0, EADDRINUSE)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true, + /* reuse_addr */ false, + /* bind_to_device */ 0, EADDRINUSE)); } TEST_P(BindToDeviceSequenceTest, BindWithReuse) { - ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ true)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE)); ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ true, /* bind_to_device */ 123)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE)); - ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ true, /* bind_to_device */ 0)); + BindSocket(/* reusePort */ true, /* reuse_addr */ false)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 123, EADDRINUSE)); + ASSERT_NO_FATAL_FAILURE(BindSocket( + /* reuse_port */ true, /* reuse_addr */ false, + /* bind_to_device */ 123)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 0, EADDRINUSE)); + ASSERT_NO_FATAL_FAILURE(BindSocket( + /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 0)); } TEST_P(BindToDeviceSequenceTest, BindingWithReuseAndDevice) { - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ true, /* bind_to_device */ 123)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ true, /* bind_to_device */ 123)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ true, /* bind_to_device */ 456)); - ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ true)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ true, /* bind_to_device */ 789)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 999, EADDRINUSE)); + ASSERT_NO_FATAL_FAILURE(BindSocket( + /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 123, EADDRINUSE)); + ASSERT_NO_FATAL_FAILURE(BindSocket( + /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 0, EADDRINUSE)); + ASSERT_NO_FATAL_FAILURE(BindSocket( + /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 456)); + ASSERT_NO_FATAL_FAILURE( + BindSocket(/* reuse_port */ true, /* reuse_addr */ false)); + ASSERT_NO_FATAL_FAILURE(BindSocket( + /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 789)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 999, EADDRINUSE)); } TEST_P(BindToDeviceSequenceTest, MixingReuseAndNotReuseByBindingToDevice) { - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ true, /* bind_to_device */ 123, 0)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 456, 0)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ true, /* bind_to_device */ 789, 0)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 999, 0)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true, + /* reuse_addr */ false, + /* bind_to_device */ 123, 0)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 456, 0)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true, + /* reuse_addr */ false, + /* bind_to_device */ 789, 0)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 999, 0)); } TEST_P(BindToDeviceSequenceTest, CannotBindTo0AfterMixingReuseAndNotReuse) { - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ true, /* bind_to_device */ 123)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 456)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE)); + ASSERT_NO_FATAL_FAILURE(BindSocket( + /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 456)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true, + /* reuse_addr */ false, + /* bind_to_device */ 0, EADDRINUSE)); } TEST_P(BindToDeviceSequenceTest, BindAndRelease) { - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ true, /* bind_to_device */ 123)); + ASSERT_NO_FATAL_FAILURE(BindSocket( + /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123)); int to_release; - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ true, /* bind_to_device */ 0, 0, &to_release)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 345, EADDRINUSE)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ true, /* bind_to_device */ 789)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true, + /* reuse_addr */ false, + /* bind_to_device */ 0, 0, &to_release)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 345, EADDRINUSE)); + ASSERT_NO_FATAL_FAILURE(BindSocket( + /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 789)); // Release the bind to device 0 and try again. ASSERT_NO_FATAL_FAILURE(ReleaseSocket(to_release)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 345)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 345)); } TEST_P(BindToDeviceSequenceTest, BindTwiceWithReuseOnce) { + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 123)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true, + /* reuse_addr */ false, + /* bind_to_device */ 0, EADDRINUSE)); +} + +TEST_P(BindToDeviceSequenceTest, BindWithReuseAddr) { + // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets. + SKIP_IF(IsRunningOnGvisor()); + ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ false, /* bind_to_device */ 123)); - ASSERT_NO_FATAL_FAILURE( - BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE)); + BindSocket(/* reusePort */ false, /* reuse_addr */ true)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 123, EADDRINUSE)); + ASSERT_NO_FATAL_FAILURE(BindSocket( + /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 123)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 0, EADDRINUSE)); + ASSERT_NO_FATAL_FAILURE(BindSocket( + /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 0)); +} + +TEST_P(BindToDeviceSequenceTest, + CannotBindTo0AfterMixingReuseAddrAndNotReuseAddr) { + ASSERT_NO_FATAL_FAILURE(BindSocket( + /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 456)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ true, + /* bind_to_device */ 0, EADDRINUSE)); +} + +TEST_P(BindToDeviceSequenceTest, BindReuseAddrReusePortThenReusePort) { + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true, + /* reuse_addr */ true, + /* bind_to_device */ 0)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true, + /* reuse_addr */ false, + /* bind_to_device */ 0)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ true, + /* bind_to_device */ 0, EADDRINUSE)); +} + +TEST_P(BindToDeviceSequenceTest, BindReuseAddrReusePortThenReuseAddr) { + // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets. + SKIP_IF(IsRunningOnGvisor()); + + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true, + /* reuse_addr */ true, + /* bind_to_device */ 0)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ true, + /* bind_to_device */ 0)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true, + /* reuse_addr */ false, + /* bind_to_device */ 0, EADDRINUSE)); +} + +TEST_P(BindToDeviceSequenceTest, BindDoubleReuseAddrReusePortThenReusePort) { + ASSERT_NO_FATAL_FAILURE(BindSocket( + /* reuse_port */ true, /* reuse_addr */ true, /* bind_to_device */ 0)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true, + /* reuse_addr */ true, + /* bind_to_device */ 0)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true, + /* reuse_addr */ false, + /* bind_to_device */ 0)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ true, + /* bind_to_device */ 0, EADDRINUSE)); +} + +TEST_P(BindToDeviceSequenceTest, BindDoubleReuseAddrReusePortThenReuseAddr) { + // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets. + SKIP_IF(IsRunningOnGvisor()); + + ASSERT_NO_FATAL_FAILURE(BindSocket( + /* reuse_port */ true, /* reuse_addr */ true, /* bind_to_device */ 0)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true, + /* reuse_addr */ true, + /* bind_to_device */ 0)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ true, + /* bind_to_device */ 0)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true, + /* reuse_addr */ false, + /* bind_to_device */ 0, EADDRINUSE)); +} + +TEST_P(BindToDeviceSequenceTest, BindReusePortThenReuseAddrReusePort) { + ASSERT_NO_FATAL_FAILURE(BindSocket( + /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 0)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true, + /* reuse_addr */ true, + /* bind_to_device */ 0)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ true, + /* bind_to_device */ 0, EADDRINUSE)); +} + +TEST_P(BindToDeviceSequenceTest, BindReuseAddrThenReuseAddr) { + ASSERT_NO_FATAL_FAILURE(BindSocket( + /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 0)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true, + /* reuse_addr */ false, + /* bind_to_device */ 0, EADDRINUSE)); +} + +// This behavior seems like a bug? +TEST_P(BindToDeviceSequenceTest, + BindReuseAddrThenReuseAddrReusePortThenReuseAddr) { + // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets. + SKIP_IF(IsRunningOnGvisor()); + + ASSERT_NO_FATAL_FAILURE(BindSocket( + /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 0)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true, + /* reuse_addr */ true, + /* bind_to_device */ 0)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true, + /* reuse_addr */ false, + /* bind_to_device */ 0)); +} + +// Repro test for gvisor.dev/issue/1217. Not replicated in ports_test.go as this +// test is different from the others and wouldn't fit well there. +TEST_P(BindToDeviceSequenceTest, BindAndReleaseDifferentDevice) { + int to_release; + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 3, 0, &to_release)); + ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false, + /* reuse_addr */ false, + /* bind_to_device */ 3, EADDRINUSE)); + // Change the device. Since the socket was already bound, this should have no + // effect. + SetDevice(to_release, 2); + // Release the bind to device 3 and try again. + ASSERT_NO_FATAL_FAILURE(ReleaseSocket(to_release)); + ASSERT_NO_FATAL_FAILURE(BindSocket( + /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 3)); } INSTANTIATE_TEST_SUITE_P(BindToDeviceTest, BindToDeviceSequenceTest, diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc index 96a1731cf..619d41901 100644 --- a/test/syscalls/linux/socket_inet_loopback.cc +++ b/test/syscalls/linux/socket_inet_loopback.cc @@ -13,12 +13,10 @@ // limitations under the License. #include <arpa/inet.h> -#include <linux/tcp.h> #include <netinet/in.h> +#include <netinet/tcp.h> #include <poll.h> #include <string.h> -#include <sys/epoll.h> -#include <sys/socket.h> #include <atomic> #include <iostream> @@ -46,6 +44,8 @@ namespace testing { namespace { +using ::testing::Gt; + PosixErrorOr<uint16_t> AddrPort(int family, sockaddr_storage const& addr) { switch (family) { case AF_INET: @@ -102,19 +102,17 @@ TEST(BadSocketPairArgs, ValidateErrForBadCallsToSocketPair) { SyscallFailsWithErrno(EAFNOSUPPORT)); } -TEST_P(SocketInetLoopbackTest, TCP) { - auto const& param = GetParam(); - - TestAddress const& listener = param.listener; - TestAddress const& connector = param.connector; - +void tcpSimpleConnectTest(TestAddress const& listener, + TestAddress const& connector, bool unbound) { // Create the listening socket. const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE( Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP)); sockaddr_storage listen_addr = listener.addr; - ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr), - listener.addr_len), - SyscallSucceeds()); + if (!unbound) { + ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr), + listener.addr_len), + SyscallSucceeds()); + } ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds()); // Get the port bound by the listening socket. @@ -148,6 +146,23 @@ TEST_P(SocketInetLoopbackTest, TCP) { ASSERT_THAT(shutdown(conn_fd.get(), SHUT_RDWR), SyscallSucceeds()); } +TEST_P(SocketInetLoopbackTest, TCP) { + auto const& param = GetParam(); + TestAddress const& listener = param.listener; + TestAddress const& connector = param.connector; + + tcpSimpleConnectTest(listener, connector, true); +} + +TEST_P(SocketInetLoopbackTest, TCPListenUnbound) { + auto const& param = GetParam(); + + TestAddress const& listener = param.listener; + TestAddress const& connector = param.connector; + + tcpSimpleConnectTest(listener, connector, false); +} + TEST_P(SocketInetLoopbackTest, TCPListenClose) { auto const& param = GetParam(); @@ -206,7 +221,7 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) { } // TODO(b/138400178): Fix cooperative S/R failure when ds.reset() is invoked // before function end. - // ds.reset() + // ds.reset(); } TEST_P(SocketInetLoopbackTest, TCPbacklog) { @@ -603,6 +618,60 @@ TEST_P(SocketInetLoopbackTest, TCPTimeWaitTest_NoRandomSave) { SyscallSucceeds()); } +TEST_P(SocketInetLoopbackTest, AcceptedInheritsTCPUserTimeout) { + auto const& param = GetParam(); + TestAddress const& listener = param.listener; + TestAddress const& connector = param.connector; + + // Create the listening socket. + const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP)); + sockaddr_storage listen_addr = listener.addr; + ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr), + listener.addr_len), + SyscallSucceeds()); + ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds()); + + // Get the port bound by the listening socket. + socklen_t addrlen = listener.addr_len; + ASSERT_THAT(getsockname(listen_fd.get(), + reinterpret_cast<sockaddr*>(&listen_addr), &addrlen), + SyscallSucceeds()); + + const uint16_t port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr)); + + // Set the userTimeout on the listening socket. + constexpr int kUserTimeout = 10; + ASSERT_THAT(setsockopt(listen_fd.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, + &kUserTimeout, sizeof(kUserTimeout)), + SyscallSucceeds()); + + // Connect to the listening socket. + FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP)); + + sockaddr_storage conn_addr = connector.addr; + ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port)); + ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(), + reinterpret_cast<sockaddr*>(&conn_addr), + connector.addr_len), + SyscallSucceeds()); + + // Accept the connection. + auto accepted = + ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr)); + // Verify that the accepted socket inherited the user timeout set on + // listening socket. + int get = -1; + socklen_t get_len = sizeof(get); + ASSERT_THAT( + getsockopt(accepted.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len), + SyscallSucceeds()); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kUserTimeout); +} + INSTANTIATE_TEST_SUITE_P( All, SocketInetLoopbackTest, ::testing::Values( @@ -635,7 +704,9 @@ INSTANTIATE_TEST_SUITE_P( using SocketInetReusePortTest = ::testing::TestWithParam<TestParam>; -TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) { +// TODO(gvisor.dev/issue/940): Remove _NoRandomSave when portHint/stack.Seed is +// saved/restored. +TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) { auto const& param = GetParam(); TestAddress const& listener = param.listener; @@ -643,6 +714,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) { sockaddr_storage listen_addr = listener.addr; sockaddr_storage conn_addr = connector.addr; constexpr int kThreadCount = 3; + constexpr int kConnectAttempts = 4096; // Create the listening socket. FileDescriptor listener_fds[kThreadCount]; @@ -657,7 +729,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) { ASSERT_THAT( bind(fd, reinterpret_cast<sockaddr*>(&listen_addr), listener.addr_len), SyscallSucceeds()); - ASSERT_THAT(listen(fd, 40), SyscallSucceeds()); + ASSERT_THAT(listen(fd, kConnectAttempts / 3), SyscallSucceeds()); // On the first bind we need to determine which port was bound. if (i != 0) { @@ -676,7 +748,6 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) { ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port)); } - constexpr int kConnectAttempts = 10000; std::atomic<int> connects_received = ATOMIC_VAR_INIT(0); std::unique_ptr<ScopedThread> listen_thread[kThreadCount]; int accept_counts[kThreadCount] = {}; @@ -920,41 +991,44 @@ TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThreadShort) { SyscallSucceedsWithValue(sizeof(i))); } - int epollfd; - ASSERT_THAT(epollfd = epoll_create1(0), SyscallSucceeds()); - + struct pollfd pollfds[kThreadCount]; for (int i = 0; i < kThreadCount; i++) { - int fd = listener_fds[i].get(); - struct epoll_event ev; - ev.data.fd = fd; - ev.events = EPOLLIN; - ASSERT_THAT(epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &ev), SyscallSucceeds()); + pollfds[i].fd = listener_fds[i].get(); + pollfds[i].events = POLLIN; } std::map<uint16_t, int> portToFD; - for (int i = 0; i < kConnectAttempts * 2; i++) { - struct sockaddr_storage addr = {}; - socklen_t addrlen = sizeof(addr); - struct epoll_event ev; - int data, fd; + int received = 0; + while (received < kConnectAttempts * 2) { + ASSERT_THAT(poll(pollfds, kThreadCount, -1), + SyscallSucceedsWithValue(Gt(0))); - ASSERT_THAT(epoll_wait(epollfd, &ev, 1, -1), SyscallSucceedsWithValue(1)); + for (int i = 0; i < kThreadCount; i++) { + if ((pollfds[i].revents & POLLIN) == 0) { + continue; + } - fd = ev.data.fd; - EXPECT_THAT(RetryEINTR(recvfrom)(fd, &data, sizeof(data), 0, - reinterpret_cast<struct sockaddr*>(&addr), - &addrlen), - SyscallSucceedsWithValue(sizeof(data))); - uint16_t const port = - ASSERT_NO_ERRNO_AND_VALUE(AddrPort(connector.family(), addr)); - auto prev_port = portToFD.find(port); - // Check that all packets from one client have been delivered to the same - // server socket. - if (prev_port == portToFD.end()) { - portToFD[port] = ev.data.fd; - } else { - EXPECT_EQ(portToFD[port], ev.data.fd); + received++; + + const int fd = pollfds[i].fd; + struct sockaddr_storage addr = {}; + socklen_t addrlen = sizeof(addr); + int data; + EXPECT_THAT(RetryEINTR(recvfrom)( + fd, &data, sizeof(data), 0, + reinterpret_cast<struct sockaddr*>(&addr), &addrlen), + SyscallSucceedsWithValue(sizeof(data))); + uint16_t const port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(connector.family(), addr)); + auto prev_port = portToFD.find(port); + // Check that all packets from one client have been delivered to the + // same server socket. + if (prev_port == portToFD.end()) { + portToFD[port] = fd; + } else { + EXPECT_EQ(portToFD[port], fd); + } } } } @@ -1841,7 +1915,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, NoReusePortFollowingReusePort) { } INSTANTIATE_TEST_SUITE_P( - AllFamlies, SocketMultiProtocolInetLoopbackTest, + AllFamilies, SocketMultiProtocolInetLoopbackTest, ::testing::Values(ProtocolTestParam{"TCP", SOCK_STREAM}, ProtocolTestParam{"UDP", SOCK_DGRAM}), DescribeProtocolTestParam); diff --git a/test/syscalls/linux/socket_ip_loopback_blocking.cc b/test/syscalls/linux/socket_ip_loopback_blocking.cc index e58eedaba..fda252dd7 100644 --- a/test/syscalls/linux/socket_ip_loopback_blocking.cc +++ b/test/syscalls/linux/socket_ip_loopback_blocking.cc @@ -23,6 +23,7 @@ namespace gvisor { namespace testing { +namespace { std::vector<SocketPairKind> GetSocketPairs() { return VecCat<SocketPairKind>( @@ -43,5 +44,6 @@ INSTANTIATE_TEST_SUITE_P( BlockingIPSockets, BlockingSocketPairTest, ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); +} // namespace } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc index a37b49447..57ce8e169 100644 --- a/test/syscalls/linux/socket_ip_tcp_generic.cc +++ b/test/syscalls/linux/socket_ip_tcp_generic.cc @@ -24,6 +24,8 @@ #include <sys/un.h> #include "gtest/gtest.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" #include "test/syscalls/linux/socket_test_util.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" @@ -789,5 +791,89 @@ TEST_P(TCPSocketPairTest, SetTCPLingerTimeout) { EXPECT_EQ(get, kTCPLingerTimeout); } +TEST_P(TCPSocketPairTest, TestTCPCloseWithData) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ScopedThread t([&]() { + // Close one end to trigger sending of a FIN. + ASSERT_THAT(shutdown(sockets->second_fd(), SHUT_WR), SyscallSucceeds()); + char buf[3]; + ASSERT_THAT(read(sockets->second_fd(), buf, 3), + SyscallSucceedsWithValue(3)); + absl::SleepFor(absl::Milliseconds(50)); + ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds()); + }); + + absl::SleepFor(absl::Milliseconds(50)); + // Send some data then close. + constexpr char kStr[] = "abc"; + ASSERT_THAT(write(sockets->first_fd(), kStr, 3), SyscallSucceedsWithValue(3)); + t.Join(); + ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds()); +} + +TEST_P(TCPSocketPairTest, TCPUserTimeoutDefault) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + int get = -1; + socklen_t get_len = sizeof(get); + ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT, + &get, &get_len), + SyscallSucceeds()); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, 0); // 0 ms (disabled). +} + +TEST_P(TCPSocketPairTest, SetTCPUserTimeoutZero) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + constexpr int kZero = 0; + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT, + &kZero, sizeof(kZero)), + SyscallSucceeds()); + + int get = -1; + socklen_t get_len = sizeof(get); + ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT, + &get, &get_len), + SyscallSucceeds()); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, 0); // 0 ms (disabled). +} + +TEST_P(TCPSocketPairTest, SetTCPUserTimeoutBelowZero) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + constexpr int kNeg = -10; + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT, + &kNeg, sizeof(kNeg)), + SyscallFailsWithErrno(EINVAL)); + + int get = -1; + socklen_t get_len = sizeof(get); + ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT, + &get, &get_len), + SyscallSucceeds()); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, 0); // 0 ms (disabled). +} + +TEST_P(TCPSocketPairTest, SetTCPUserTimeoutAboveZero) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + constexpr int kAbove = 10; + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT, + &kAbove, sizeof(kAbove)), + SyscallSucceeds()); + + int get = -1; + socklen_t get_len = sizeof(get); + ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT, + &get, &get_len), + SyscallSucceeds()); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kAbove); +} + } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/socket_ip_tcp_loopback.cc b/test/syscalls/linux/socket_ip_tcp_loopback.cc index 831de53b8..9db3037bc 100644 --- a/test/syscalls/linux/socket_ip_tcp_loopback.cc +++ b/test/syscalls/linux/socket_ip_tcp_loopback.cc @@ -21,6 +21,7 @@ namespace gvisor { namespace testing { +namespace { std::vector<SocketPairKind> GetSocketPairs() { return { @@ -34,5 +35,6 @@ INSTANTIATE_TEST_SUITE_P( AllUnixDomainSockets, AllSocketPairTest, ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); +} // namespace } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc index 6b1af6c17..aa6fb4e3f 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc @@ -1814,7 +1814,7 @@ TEST_P(IPv4UDPUnboundSocketTest, BindReusePortThenReuseAddr) { SyscallFailsWithErrno(EADDRINUSE)); } -TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertableToReusePort) { +TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertibleToReusePort) { auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); @@ -1855,7 +1855,7 @@ TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertableToReusePort) { SyscallFailsWithErrno(EADDRINUSE)); } -TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertableToReuseAddr) { +TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertibleToReuseAddr) { // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets. SKIP_IF(IsRunningOnGvisor()); diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc index 8a28202a8..4cf1f76f1 100644 --- a/test/syscalls/linux/socket_unix.cc +++ b/test/syscalls/linux/socket_unix.cc @@ -65,6 +65,21 @@ TEST_P(UnixSocketPairTest, BindToBadName) { SyscallFailsWithErrno(ENOENT)); } +TEST_P(UnixSocketPairTest, BindToBadFamily) { + auto pair = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create()); + + constexpr char kBadName[] = "/some/path/that/does/not/exist"; + sockaddr_un sockaddr; + sockaddr.sun_family = AF_INET; + memcpy(sockaddr.sun_path, kBadName, sizeof(kBadName)); + + EXPECT_THAT( + bind(pair->first_fd(), reinterpret_cast<struct sockaddr*>(&sockaddr), + sizeof(sockaddr)), + SyscallFailsWithErrno(EINVAL)); +} + TEST_P(UnixSocketPairTest, RecvmmsgTimeoutAfterRecv) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); char sent_data[10]; diff --git a/test/syscalls/linux/socket_unix_cmsg.cc b/test/syscalls/linux/socket_unix_cmsg.cc index 1159c5229..a16899493 100644 --- a/test/syscalls/linux/socket_unix_cmsg.cc +++ b/test/syscalls/linux/socket_unix_cmsg.cc @@ -149,6 +149,35 @@ TEST_P(UnixSocketPairCmsgTest, BadFDPass) { SyscallFailsWithErrno(EBADF)); } +TEST_P(UnixSocketPairCmsgTest, ShortCmsg) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[20]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + int sent_fd = -1; + + struct msghdr msg = {}; + char control[CMSG_SPACE(sizeof(sent_fd))]; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_len = 1; + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), &sent_fd, sizeof(sent_fd)); + + struct iovec iov; + iov.iov_base = sent_data; + iov.iov_len = sizeof(sent_data); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ASSERT_THAT(RetryEINTR(sendmsg)(sockets->first_fd(), &msg, 0), + SyscallFailsWithErrno(EINVAL)); +} + // BasicFDPassNoSpace starts off by sending a single FD just like BasicFDPass. // The difference is that when calling recvmsg, no space for FDs is provided, // only space for the cmsg header. diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc index 99863b0ed..6b99c021d 100644 --- a/test/syscalls/linux/tcp_socket.cc +++ b/test/syscalls/linux/tcp_socket.cc @@ -967,6 +967,78 @@ TEST_P(SimpleTcpSocketTest, BlockingConnectRefused) { EXPECT_THAT(close(s.release()), SyscallSucceeds()); } +// Test that connecting to a non-listening port and thus receiving a RST is +// handled appropriately by the socket - the port that the socket was bound to +// is released and the expected error is returned. +TEST_P(SimpleTcpSocketTest, CleanupOnConnectionRefused) { + // Create a socket that is known to not be listening. As is it bound but not + // listening, when another socket connects to the port, it will refuse.. + FileDescriptor bound_s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + + sockaddr_storage bound_addr = + ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam())); + socklen_t bound_addrlen = sizeof(bound_addr); + + ASSERT_THAT( + bind(bound_s.get(), reinterpret_cast<struct sockaddr*>(&bound_addr), + bound_addrlen), + SyscallSucceeds()); + + // Get the addresses the socket is bound to because the port is chosen by the + // stack. + ASSERT_THAT(getsockname(bound_s.get(), + reinterpret_cast<struct sockaddr*>(&bound_addr), + &bound_addrlen), + SyscallSucceeds()); + + // Create, initialize, and bind the socket that is used to test connecting to + // the non-listening port. + FileDescriptor client_s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + // Initialize client address to the loopback one. + sockaddr_storage client_addr = + ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam())); + socklen_t client_addrlen = sizeof(client_addr); + + ASSERT_THAT( + bind(client_s.get(), reinterpret_cast<struct sockaddr*>(&client_addr), + client_addrlen), + SyscallSucceeds()); + + ASSERT_THAT(getsockname(client_s.get(), + reinterpret_cast<struct sockaddr*>(&client_addr), + &client_addrlen), + SyscallSucceeds()); + + // Now the test: connect to the bound but not listening socket with the + // client socket. The bound socket should return a RST and cause the client + // socket to return an error and clean itself up immediately. + // The error being ECONNREFUSED diverges with RFC 793, page 37, but does what + // Linux does. + ASSERT_THAT(connect(client_s.get(), + reinterpret_cast<const struct sockaddr*>(&bound_addr), + bound_addrlen), + SyscallFailsWithErrno(ECONNREFUSED)); + + FileDescriptor new_s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + + // Test binding to the address from the client socket. This should be okay + // if it was dropped correctly. + ASSERT_THAT( + bind(new_s.get(), reinterpret_cast<struct sockaddr*>(&client_addr), + client_addrlen), + SyscallSucceeds()); + + // Attempt #2, with the new socket and reused addr our connect should fail in + // the same way as before, not with an EADDRINUSE. + ASSERT_THAT(connect(client_s.get(), + reinterpret_cast<const struct sockaddr*>(&bound_addr), + bound_addrlen), + SyscallFailsWithErrno(ECONNREFUSED)); +} + // Test that we get an ECONNREFUSED with a nonblocking socket. TEST_P(SimpleTcpSocketTest, NonBlockingConnectRefused) { FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( @@ -1175,6 +1247,31 @@ TEST_P(SimpleTcpSocketTest, SetMaxSegFailsForInvalidMSSValues) { } } +TEST_P(SimpleTcpSocketTest, SetTCPUserTimeout) { + FileDescriptor s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + + { + constexpr int kTCPUserTimeout = -1; + EXPECT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, + &kTCPUserTimeout, sizeof(kTCPUserTimeout)), + SyscallFailsWithErrno(EINVAL)); + } + + // kTCPUserTimeout is in milliseconds. + constexpr int kTCPUserTimeout = 100; + ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, + &kTCPUserTimeout, sizeof(kTCPUserTimeout)), + SyscallSucceedsWithValue(0)); + int get = -1; + socklen_t get_len = sizeof(get); + ASSERT_THAT( + getsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kTCPUserTimeout); +} + INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest, ::testing::Values(AF_INET, AF_INET6)); diff --git a/test/syscalls/linux/udp_socket.cc b/test/syscalls/linux/udp_socket.cc index 111dbacdf..7a8ac30a4 100644 --- a/test/syscalls/linux/udp_socket.cc +++ b/test/syscalls/linux/udp_socket.cc @@ -12,1332 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include <arpa/inet.h> -#include <fcntl.h> -#include <linux/errqueue.h> -#include <netinet/in.h> -#include <sys/ioctl.h> -#include <sys/socket.h> -#include <sys/types.h> - -#include "gtest/gtest.h" -#include "absl/base/macros.h" -#include "absl/time/clock.h" -#include "absl/time/time.h" -#include "test/syscalls/linux/socket_test_util.h" -#include "test/syscalls/linux/unix_domain_socket_test_util.h" -#include "test/util/test_util.h" -#include "test/util/thread_util.h" +#include "test/syscalls/linux/udp_socket_test_cases.h" namespace gvisor { namespace testing { namespace { -// The initial port to be be used on gvisor. -constexpr int TestPort = 40000; - -// Fixture for tests parameterized by the address family to use (AF_INET and -// AF_INET6) when creating sockets. -class UdpSocketTest : public ::testing::TestWithParam<AddressFamily> { - protected: - // Creates two sockets that will be used by test cases. - void SetUp() override; - - // Closes the sockets created by SetUp(). - void TearDown() override { - EXPECT_THAT(close(s_), SyscallSucceeds()); - EXPECT_THAT(close(t_), SyscallSucceeds()); - - for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) { - ASSERT_NO_ERRNO(FreeAvailablePort(ports_[i])); - } - } - - // First UDP socket. - int s_; - - // Second UDP socket. - int t_; - - // The length of the socket address. - socklen_t addrlen_; - - // Initialized address pointing to loopback and port TestPort+i. - struct sockaddr* addr_[3]; - - // Initialize "any" address. - struct sockaddr* anyaddr_; - - // Used ports. - int ports_[3]; - - private: - // Storage for the loopback addresses. - struct sockaddr_storage addr_storage_[3]; - - // Storage for the "any" address. - struct sockaddr_storage anyaddr_storage_; -}; - -// Gets a pointer to the port component of the given address. -uint16_t* Port(struct sockaddr_storage* addr) { - switch (addr->ss_family) { - case AF_INET: { - auto sin = reinterpret_cast<struct sockaddr_in*>(addr); - return &sin->sin_port; - } - case AF_INET6: { - auto sin6 = reinterpret_cast<struct sockaddr_in6*>(addr); - return &sin6->sin6_port; - } - } - - return nullptr; -} - -void UdpSocketTest::SetUp() { - int type; - if (GetParam() == AddressFamily::kIpv4) { - type = AF_INET; - auto sin = reinterpret_cast<struct sockaddr_in*>(&anyaddr_storage_); - addrlen_ = sizeof(*sin); - sin->sin_addr.s_addr = htonl(INADDR_ANY); - } else { - type = AF_INET6; - auto sin6 = reinterpret_cast<struct sockaddr_in6*>(&anyaddr_storage_); - addrlen_ = sizeof(*sin6); - if (GetParam() == AddressFamily::kIpv6) { - sin6->sin6_addr = IN6ADDR_ANY_INIT; - } else { - TestAddress const& v4_mapped_any = V4MappedAny(); - sin6->sin6_addr = - reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr) - ->sin6_addr; - } - } - ASSERT_THAT(s_ = socket(type, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds()); - - ASSERT_THAT(t_ = socket(type, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds()); - - memset(&anyaddr_storage_, 0, sizeof(anyaddr_storage_)); - anyaddr_ = reinterpret_cast<struct sockaddr*>(&anyaddr_storage_); - anyaddr_->sa_family = type; - - if (gvisor::testing::IsRunningOnGvisor()) { - for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) { - ports_[i] = TestPort + i; - } - } else { - // When not under gvisor, use utility function to pick port. Assert that - // all ports are different. - std::string error; - for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) { - // Find an unused port, we specify port 0 to allow the kernel to provide - // the port. - bool unique = true; - do { - ports_[i] = ASSERT_NO_ERRNO_AND_VALUE(PortAvailable( - 0, AddressFamily::kDualStack, SocketType::kUdp, false)); - ASSERT_GT(ports_[i], 0); - for (size_t j = 0; j < i; ++j) { - if (ports_[j] == ports_[i]) { - unique = false; - break; - } - } - } while (!unique); - } - } - - // Initialize the sockaddrs. - for (size_t i = 0; i < ABSL_ARRAYSIZE(addr_); ++i) { - memset(&addr_storage_[i], 0, sizeof(addr_storage_[i])); - - addr_[i] = reinterpret_cast<struct sockaddr*>(&addr_storage_[i]); - addr_[i]->sa_family = type; - - switch (type) { - case AF_INET: { - auto sin = reinterpret_cast<struct sockaddr_in*>(addr_[i]); - sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); - sin->sin_port = htons(ports_[i]); - break; - } - case AF_INET6: { - auto sin6 = reinterpret_cast<struct sockaddr_in6*>(addr_[i]); - sin6->sin6_addr = in6addr_loopback; - sin6->sin6_port = htons(ports_[i]); - break; - } - } - } -} - -TEST_P(UdpSocketTest, Creation) { - int type = AF_INET6; - if (GetParam() == AddressFamily::kIpv4) { - type = AF_INET; - } - - int s_; - - ASSERT_THAT(s_ = socket(type, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds()); - EXPECT_THAT(close(s_), SyscallSucceeds()); - - ASSERT_THAT(s_ = socket(type, SOCK_DGRAM, 0), SyscallSucceeds()); - EXPECT_THAT(close(s_), SyscallSucceeds()); - - ASSERT_THAT(s_ = socket(type, SOCK_STREAM, IPPROTO_UDP), SyscallFails()); -} - -TEST_P(UdpSocketTest, Getsockname) { - // Check that we're not bound. - struct sockaddr_storage addr; - socklen_t addrlen = sizeof(addr); - EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), - SyscallSucceeds()); - EXPECT_EQ(addrlen, addrlen_); - EXPECT_EQ(memcmp(&addr, anyaddr_, addrlen_), 0); - - // Bind, then check that we get the right address. - ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); - - addrlen = sizeof(addr); - EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), - SyscallSucceeds()); - EXPECT_EQ(addrlen, addrlen_); - EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0); -} - -TEST_P(UdpSocketTest, Getpeername) { - // Check that we're not connected. - struct sockaddr_storage addr; - socklen_t addrlen = sizeof(addr); - EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), - SyscallFailsWithErrno(ENOTCONN)); - - // Connect, then check that we get the right address. - ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); - - addrlen = sizeof(addr); - EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), - SyscallSucceeds()); - EXPECT_EQ(addrlen, addrlen_); - EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0); -} - -TEST_P(UdpSocketTest, SendNotConnected) { - // Do send & write, they must fail. - char buf[512]; - EXPECT_THAT(send(s_, buf, sizeof(buf), 0), - SyscallFailsWithErrno(EDESTADDRREQ)); - - EXPECT_THAT(write(s_, buf, sizeof(buf)), SyscallFailsWithErrno(EDESTADDRREQ)); - - // Use sendto. - ASSERT_THAT(sendto(s_, buf, sizeof(buf), 0, addr_[0], addrlen_), - SyscallSucceedsWithValue(sizeof(buf))); - - // Check that we're bound now. - struct sockaddr_storage addr; - socklen_t addrlen = sizeof(addr); - EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), - SyscallSucceeds()); - EXPECT_EQ(addrlen, addrlen_); - EXPECT_NE(*Port(&addr), 0); -} - -TEST_P(UdpSocketTest, ConnectBinds) { - // Connect the socket. - ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); - - // Check that we're bound now. - struct sockaddr_storage addr; - socklen_t addrlen = sizeof(addr); - EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), - SyscallSucceeds()); - EXPECT_EQ(addrlen, addrlen_); - EXPECT_NE(*Port(&addr), 0); -} - -TEST_P(UdpSocketTest, ReceiveNotBound) { - char buf[512]; - EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT), - SyscallFailsWithErrno(EWOULDBLOCK)); -} - -TEST_P(UdpSocketTest, Bind) { - ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); - - // Try to bind again. - EXPECT_THAT(bind(s_, addr_[1], addrlen_), SyscallFailsWithErrno(EINVAL)); - - // Check that we're still bound to the original address. - struct sockaddr_storage addr; - socklen_t addrlen = sizeof(addr); - EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), - SyscallSucceeds()); - EXPECT_EQ(addrlen, addrlen_); - EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0); -} - -TEST_P(UdpSocketTest, BindInUse) { - ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); - - // Try to bind again. - EXPECT_THAT(bind(t_, addr_[0], addrlen_), SyscallFailsWithErrno(EADDRINUSE)); -} - -TEST_P(UdpSocketTest, ReceiveAfterConnect) { - // Connect s_ to loopback:TestPort, and bind t_ to loopback:TestPort. - ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); - ASSERT_THAT(bind(t_, addr_[0], addrlen_), SyscallSucceeds()); - - // Get the address s_ was bound to during connect. - struct sockaddr_storage addr; - socklen_t addrlen = sizeof(addr); - EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), - SyscallSucceeds()); - EXPECT_EQ(addrlen, addrlen_); - - // Send from t_ to s_. - char buf[512]; - RandomizeBuffer(buf, sizeof(buf)); - ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, - reinterpret_cast<sockaddr*>(&addr), addrlen), - SyscallSucceedsWithValue(sizeof(buf))); - - // Receive the data. - char received[sizeof(buf)]; - EXPECT_THAT(recv(s_, received, sizeof(received), 0), - SyscallSucceedsWithValue(sizeof(received))); - EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); -} - -TEST_P(UdpSocketTest, ReceiveAfterDisconnect) { - // Connect s_ to loopback:TestPort, and bind t_ to loopback:TestPort. - ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); - ASSERT_THAT(bind(t_, addr_[0], addrlen_), SyscallSucceeds()); - ASSERT_THAT(connect(t_, addr_[1], addrlen_), SyscallSucceeds()); - - // Get the address s_ was bound to during connect. - struct sockaddr_storage addr; - socklen_t addrlen = sizeof(addr); - EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), - SyscallSucceeds()); - EXPECT_EQ(addrlen, addrlen_); - - for (int i = 0; i < 2; i++) { - // Send from t_ to s_. - char buf[512]; - RandomizeBuffer(buf, sizeof(buf)); - EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), - SyscallSucceeds()); - ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, - reinterpret_cast<sockaddr*>(&addr), addrlen), - SyscallSucceedsWithValue(sizeof(buf))); - - // Receive the data. - char received[sizeof(buf)]; - EXPECT_THAT(recv(s_, received, sizeof(received), 0), - SyscallSucceedsWithValue(sizeof(received))); - EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); - - // Disconnect s_. - struct sockaddr addr = {}; - addr.sa_family = AF_UNSPEC; - ASSERT_THAT(connect(s_, &addr, sizeof(addr.sa_family)), SyscallSucceeds()); - // Connect s_ loopback:TestPort. - ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); - } -} - -TEST_P(UdpSocketTest, Connect) { - ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); - - // Check that we're connected to the right peer. - struct sockaddr_storage peer; - socklen_t peerlen = sizeof(peer); - EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen), - SyscallSucceeds()); - EXPECT_EQ(peerlen, addrlen_); - EXPECT_EQ(memcmp(&peer, addr_[0], addrlen_), 0); - - // Try to bind after connect. - EXPECT_THAT(bind(s_, addr_[1], addrlen_), SyscallFailsWithErrno(EINVAL)); - - // Try to connect again. - EXPECT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds()); - - // Check that peer name changed. - peerlen = sizeof(peer); - EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen), - SyscallSucceeds()); - EXPECT_EQ(peerlen, addrlen_); - EXPECT_EQ(memcmp(&peer, addr_[2], addrlen_), 0); -} - -void ConnectAny(AddressFamily family, int sockfd, uint16_t port) { - struct sockaddr_storage addr = {}; - - // Precondition check. - { - socklen_t addrlen = sizeof(addr); - EXPECT_THAT( - getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen), - SyscallSucceeds()); - - if (family == AddressFamily::kIpv4) { - auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr); - EXPECT_EQ(addrlen, sizeof(*addr_out)); - EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_ANY)); - } else { - auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr); - EXPECT_EQ(addrlen, sizeof(*addr_out)); - struct in6_addr any = IN6ADDR_ANY_INIT; - EXPECT_EQ(memcmp(&addr_out->sin6_addr, &any, sizeof(in6_addr)), 0); - } - - { - socklen_t addrlen = sizeof(addr); - EXPECT_THAT( - getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen), - SyscallFailsWithErrno(ENOTCONN)); - } - - struct sockaddr_storage baddr = {}; - if (family == AddressFamily::kIpv4) { - auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr); - addrlen = sizeof(*addr_in); - addr_in->sin_family = AF_INET; - addr_in->sin_addr.s_addr = htonl(INADDR_ANY); - addr_in->sin_port = port; - } else { - auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr); - addrlen = sizeof(*addr_in); - addr_in->sin6_family = AF_INET6; - addr_in->sin6_port = port; - if (family == AddressFamily::kIpv6) { - addr_in->sin6_addr = IN6ADDR_ANY_INIT; - } else { - TestAddress const& v4_mapped_any = V4MappedAny(); - addr_in->sin6_addr = - reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr) - ->sin6_addr; - } - } - - // TODO(b/138658473): gVisor doesn't allow connecting to the zero port. - if (port == 0) { - SKIP_IF(IsRunningOnGvisor()); - } - - ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&baddr), addrlen), - SyscallSucceeds()); - } - - // Postcondition check. - { - socklen_t addrlen = sizeof(addr); - EXPECT_THAT( - getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen), - SyscallSucceeds()); - - if (family == AddressFamily::kIpv4) { - auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr); - EXPECT_EQ(addrlen, sizeof(*addr_out)); - EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_LOOPBACK)); - } else { - auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr); - EXPECT_EQ(addrlen, sizeof(*addr_out)); - struct in6_addr loopback; - if (family == AddressFamily::kIpv6) { - loopback = IN6ADDR_LOOPBACK_INIT; - } else { - TestAddress const& v4_mapped_loopback = V4MappedLoopback(); - loopback = reinterpret_cast<const struct sockaddr_in6*>( - &v4_mapped_loopback.addr) - ->sin6_addr; - } - - EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0); - } - - addrlen = sizeof(addr); - if (port == 0) { - EXPECT_THAT( - getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen), - SyscallFailsWithErrno(ENOTCONN)); - } else { - EXPECT_THAT( - getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen), - SyscallSucceeds()); - } - } -} - -TEST_P(UdpSocketTest, ConnectAny) { ConnectAny(GetParam(), s_, 0); } - -TEST_P(UdpSocketTest, ConnectAnyWithPort) { - auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1])); - ConnectAny(GetParam(), s_, port); -} - -void DisconnectAfterConnectAny(AddressFamily family, int sockfd, int port) { - struct sockaddr_storage addr = {}; - - socklen_t addrlen = sizeof(addr); - struct sockaddr_storage baddr = {}; - if (family == AddressFamily::kIpv4) { - auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr); - addrlen = sizeof(*addr_in); - addr_in->sin_family = AF_INET; - addr_in->sin_addr.s_addr = htonl(INADDR_ANY); - addr_in->sin_port = port; - } else { - auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr); - addrlen = sizeof(*addr_in); - addr_in->sin6_family = AF_INET6; - addr_in->sin6_port = port; - if (family == AddressFamily::kIpv6) { - addr_in->sin6_addr = IN6ADDR_ANY_INIT; - } else { - TestAddress const& v4_mapped_any = V4MappedAny(); - addr_in->sin6_addr = - reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr) - ->sin6_addr; - } - } - - // TODO(b/138658473): gVisor doesn't allow connecting to the zero port. - if (port == 0) { - SKIP_IF(IsRunningOnGvisor()); - } - - ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&baddr), addrlen), - SyscallSucceeds()); - // Now the socket is bound to the loopback address. - - // Disconnect - addrlen = sizeof(addr); - addr.ss_family = AF_UNSPEC; - ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&addr), addrlen), - SyscallSucceeds()); - - // Check that after disconnect the socket is bound to the ANY address. - EXPECT_THAT(getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen), - SyscallSucceeds()); - if (family == AddressFamily::kIpv4) { - auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr); - EXPECT_EQ(addrlen, sizeof(*addr_out)); - EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_ANY)); - } else { - auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr); - EXPECT_EQ(addrlen, sizeof(*addr_out)); - struct in6_addr loopback = IN6ADDR_ANY_INIT; - - EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0); - } -} - -TEST_P(UdpSocketTest, DisconnectAfterConnectAny) { - DisconnectAfterConnectAny(GetParam(), s_, 0); -} - -TEST_P(UdpSocketTest, DisconnectAfterConnectAnyWithPort) { - auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1])); - DisconnectAfterConnectAny(GetParam(), s_, port); -} - -TEST_P(UdpSocketTest, DisconnectAfterBind) { - ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds()); - // Connect the socket. - ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); - - struct sockaddr_storage addr = {}; - addr.ss_family = AF_UNSPEC; - EXPECT_THAT( - connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)), - SyscallSucceeds()); - - // Check that we're still bound. - socklen_t addrlen = sizeof(addr); - EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), - SyscallSucceeds()); - - EXPECT_EQ(addrlen, addrlen_); - EXPECT_EQ(memcmp(&addr, addr_[1], addrlen_), 0); - - addrlen = sizeof(addr); - EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), - SyscallFailsWithErrno(ENOTCONN)); -} - -TEST_P(UdpSocketTest, DisconnectAfterBindToAny) { - struct sockaddr_storage baddr = {}; - socklen_t addrlen; - auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1])); - if (GetParam() == AddressFamily::kIpv4) { - auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr); - addr_in->sin_family = AF_INET; - addr_in->sin_port = port; - addr_in->sin_addr.s_addr = htonl(INADDR_ANY); - } else { - auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr); - addr_in->sin6_family = AF_INET6; - addr_in->sin6_port = port; - addr_in->sin6_scope_id = 0; - addr_in->sin6_addr = IN6ADDR_ANY_INIT; - } - ASSERT_THAT(bind(s_, reinterpret_cast<sockaddr*>(&baddr), addrlen_), - SyscallSucceeds()); - // Connect the socket. - ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); - - struct sockaddr_storage addr = {}; - addr.ss_family = AF_UNSPEC; - EXPECT_THAT( - connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)), - SyscallSucceeds()); - - // Check that we're still bound. - addrlen = sizeof(addr); - EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), - SyscallSucceeds()); - - EXPECT_EQ(addrlen, addrlen_); - EXPECT_EQ(memcmp(&addr, &baddr, addrlen), 0); - - addrlen = sizeof(addr); - EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), - SyscallFailsWithErrno(ENOTCONN)); -} - -TEST_P(UdpSocketTest, Disconnect) { - for (int i = 0; i < 2; i++) { - // Try to connect again. - EXPECT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds()); - - // Check that we're connected to the right peer. - struct sockaddr_storage peer; - socklen_t peerlen = sizeof(peer); - EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen), - SyscallSucceeds()); - EXPECT_EQ(peerlen, addrlen_); - EXPECT_EQ(memcmp(&peer, addr_[2], addrlen_), 0); - - // Try to disconnect. - struct sockaddr_storage addr = {}; - addr.ss_family = AF_UNSPEC; - EXPECT_THAT( - connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)), - SyscallSucceeds()); - - peerlen = sizeof(peer); - EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen), - SyscallFailsWithErrno(ENOTCONN)); - - // Check that we're still bound. - socklen_t addrlen = sizeof(addr); - EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), - SyscallSucceeds()); - EXPECT_EQ(addrlen, addrlen_); - EXPECT_EQ(*Port(&addr), 0); - } -} - -TEST_P(UdpSocketTest, ConnectBadAddress) { - struct sockaddr addr = {}; - addr.sa_family = addr_[0]->sa_family; - ASSERT_THAT(connect(s_, &addr, sizeof(addr.sa_family)), - SyscallFailsWithErrno(EINVAL)); -} - -TEST_P(UdpSocketTest, SendToAddressOtherThanConnected) { - ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); - - // Send to a different destination than we're connected to. - char buf[512]; - EXPECT_THAT(sendto(s_, buf, sizeof(buf), 0, addr_[1], addrlen_), - SyscallSucceedsWithValue(sizeof(buf))); -} - -TEST_P(UdpSocketTest, ZerolengthWriteAllowed) { - // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1. - ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); - ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds()); - - // Bind t_ to loopback:TestPort+1. - ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds()); - - char buf[3]; - // Send zero length packet from s_ to t_. - ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0)); - // Receive the packet. - char received[3]; - EXPECT_THAT(read(t_, received, sizeof(received)), - SyscallSucceedsWithValue(0)); -} - -TEST_P(UdpSocketTest, ZerolengthWriteAllowedNonBlockRead) { - // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1. - ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); - ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds()); - - // Bind t_ to loopback:TestPort+1. - ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds()); - - // Set t_ to non-blocking. - int opts = 0; - ASSERT_THAT(opts = fcntl(t_, F_GETFL), SyscallSucceeds()); - ASSERT_THAT(fcntl(t_, F_SETFL, opts | O_NONBLOCK), SyscallSucceeds()); - - char buf[3]; - // Send zero length packet from s_ to t_. - ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0)); - // Receive the packet. - char received[3]; - EXPECT_THAT(read(t_, received, sizeof(received)), - SyscallSucceedsWithValue(0)); - EXPECT_THAT(read(t_, received, sizeof(received)), - SyscallFailsWithErrno(EAGAIN)); -} - -TEST_P(UdpSocketTest, SendAndReceiveNotConnected) { - // Bind s_ to loopback. - ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); - - // Send some data to s_. - char buf[512]; - RandomizeBuffer(buf, sizeof(buf)); - - ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_), - SyscallSucceedsWithValue(sizeof(buf))); - - // Receive the data. - char received[sizeof(buf)]; - EXPECT_THAT(recv(s_, received, sizeof(received), 0), - SyscallSucceedsWithValue(sizeof(received))); - EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); -} - -TEST_P(UdpSocketTest, SendAndReceiveConnected) { - // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1. - ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); - ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds()); - - // Bind t_ to loopback:TestPort+1. - ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds()); - - // Send some data from t_ to s_. - char buf[512]; - RandomizeBuffer(buf, sizeof(buf)); - - ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_), - SyscallSucceedsWithValue(sizeof(buf))); - - // Receive the data. - char received[sizeof(buf)]; - EXPECT_THAT(recv(s_, received, sizeof(received), 0), - SyscallSucceedsWithValue(sizeof(received))); - EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); -} - -TEST_P(UdpSocketTest, ReceiveFromNotConnected) { - // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1. - ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); - ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds()); - - // Bind t_ to loopback:TestPort+2. - ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds()); - - // Send some data from t_ to s_. - char buf[512]; - ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_), - SyscallSucceedsWithValue(sizeof(buf))); - - // Check that the data isn't_ received because it was sent from a different - // address than we're connected. - EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT), - SyscallFailsWithErrno(EWOULDBLOCK)); -} - -TEST_P(UdpSocketTest, ReceiveBeforeConnect) { - // Bind s_ to loopback:TestPort. - ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); - - // Bind t_ to loopback:TestPort+2. - ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds()); - - // Send some data from t_ to s_. - char buf[512]; - RandomizeBuffer(buf, sizeof(buf)); - - ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_), - SyscallSucceedsWithValue(sizeof(buf))); - - // Connect to loopback:TestPort+1. - ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds()); - - // Receive the data. It works because it was sent before the connect. - char received[sizeof(buf)]; - EXPECT_THAT(recv(s_, received, sizeof(received), 0), - SyscallSucceedsWithValue(sizeof(received))); - EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); - - // Send again. This time it should not be received. - ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_), - SyscallSucceedsWithValue(sizeof(buf))); - - EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT), - SyscallFailsWithErrno(EWOULDBLOCK)); -} - -TEST_P(UdpSocketTest, ReceiveFrom) { - // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1. - ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); - ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds()); - - // Bind t_ to loopback:TestPort+1. - ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds()); - - // Send some data from t_ to s_. - char buf[512]; - RandomizeBuffer(buf, sizeof(buf)); - - ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_), - SyscallSucceedsWithValue(sizeof(buf))); - - // Receive the data and sender address. - char received[sizeof(buf)]; - struct sockaddr_storage addr; - socklen_t addrlen = sizeof(addr); - EXPECT_THAT(recvfrom(s_, received, sizeof(received), 0, - reinterpret_cast<sockaddr*>(&addr), &addrlen), - SyscallSucceedsWithValue(sizeof(received))); - EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); - EXPECT_EQ(addrlen, addrlen_); - EXPECT_EQ(memcmp(&addr, addr_[1], addrlen_), 0); -} - -TEST_P(UdpSocketTest, Listen) { - ASSERT_THAT(listen(s_, SOMAXCONN), SyscallFailsWithErrno(EOPNOTSUPP)); -} - -TEST_P(UdpSocketTest, Accept) { - ASSERT_THAT(accept(s_, nullptr, nullptr), SyscallFailsWithErrno(EOPNOTSUPP)); -} - -// This test validates that a read shutdown with pending data allows the read -// to proceed with the data before returning EAGAIN. -TEST_P(UdpSocketTest, ReadShutdownNonblockPendingData) { - char received[512]; - - // Bind t_ to loopback:TestPort+2. - ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds()); - ASSERT_THAT(connect(t_, addr_[1], addrlen_), SyscallSucceeds()); - - // Connect the socket, then try to shutdown again. - ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds()); - ASSERT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds()); - - // Verify that we get EWOULDBLOCK when there is nothing to read. - EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), - SyscallFailsWithErrno(EWOULDBLOCK)); - - const char* buf = "abc"; - EXPECT_THAT(write(t_, buf, 3), SyscallSucceedsWithValue(3)); - - int opts = 0; - ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds()); - ASSERT_THAT(fcntl(s_, F_SETFL, opts | O_NONBLOCK), SyscallSucceeds()); - ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds()); - ASSERT_NE(opts & O_NONBLOCK, 0); - - EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds()); - - // We should get the data even though read has been shutdown. - EXPECT_THAT(recv(s_, received, 2, 0), SyscallSucceedsWithValue(2)); - - // Because we read less than the entire packet length, since it's a packet - // based socket any subsequent reads should return EWOULDBLOCK. - EXPECT_THAT(recv(s_, received, 1, 0), SyscallFailsWithErrno(EWOULDBLOCK)); -} - -// This test is validating that even after a socket is shutdown if it's -// reconnected it will reset the shutdown state. -TEST_P(UdpSocketTest, ReadShutdownSameSocketResetsShutdownState) { - char received[512]; - EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), - SyscallFailsWithErrno(EWOULDBLOCK)); - - EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallFailsWithErrno(ENOTCONN)); - - EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), - SyscallFailsWithErrno(EWOULDBLOCK)); - - // Connect the socket, then try to shutdown again. - ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds()); - ASSERT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds()); - - EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), - SyscallFailsWithErrno(EWOULDBLOCK)); -} - -TEST_P(UdpSocketTest, ReadShutdown) { - char received[512]; - EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), - SyscallFailsWithErrno(EWOULDBLOCK)); - - EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallFailsWithErrno(ENOTCONN)); - - EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), - SyscallFailsWithErrno(EWOULDBLOCK)); - - // Connect the socket, then try to shutdown again. - ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); - - EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), - SyscallFailsWithErrno(EWOULDBLOCK)); - - EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds()); - - EXPECT_THAT(recv(s_, received, sizeof(received), 0), - SyscallSucceedsWithValue(0)); -} - -TEST_P(UdpSocketTest, ReadShutdownDifferentThread) { - char received[512]; - EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), - SyscallFailsWithErrno(EWOULDBLOCK)); - - // Connect the socket, then shutdown from another thread. - ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); - - EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), - SyscallFailsWithErrno(EWOULDBLOCK)); - - ScopedThread t([&] { - absl::SleepFor(absl::Milliseconds(200)); - EXPECT_THAT(shutdown(this->s_, SHUT_RD), SyscallSucceeds()); - }); - EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0), - SyscallSucceedsWithValue(0)); - t.Join(); - - EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0), - SyscallSucceedsWithValue(0)); -} - -TEST_P(UdpSocketTest, WriteShutdown) { - EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN)); - ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); - EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallSucceeds()); -} - -TEST_P(UdpSocketTest, SynchronousReceive) { - // Bind s_ to loopback. - ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); - - // Send some data to s_ from another thread. - char buf[512]; - RandomizeBuffer(buf, sizeof(buf)); - - // Receive the data prior to actually starting the other thread. - char received[512]; - EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), MSG_DONTWAIT), - SyscallFailsWithErrno(EWOULDBLOCK)); - - // Start the thread. - ScopedThread t([&] { - absl::SleepFor(absl::Milliseconds(200)); - ASSERT_THAT( - sendto(this->t_, buf, sizeof(buf), 0, this->addr_[0], this->addrlen_), - SyscallSucceedsWithValue(sizeof(buf))); - }); - - EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0), - SyscallSucceedsWithValue(512)); - EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); -} - -TEST_P(UdpSocketTest, BoundaryPreserved_SendRecv) { - // Bind s_ to loopback:TestPort. - ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); - - // Send 3 packets from t_ to s_. - constexpr int psize = 100; - char buf[3 * psize]; - RandomizeBuffer(buf, sizeof(buf)); - - for (int i = 0; i < 3; ++i) { - ASSERT_THAT(sendto(t_, buf + i * psize, psize, 0, addr_[0], addrlen_), - SyscallSucceedsWithValue(psize)); - } - - // Receive the data as 3 separate packets. - char received[6 * psize]; - for (int i = 0; i < 3; ++i) { - EXPECT_THAT(recv(s_, received + i * psize, 3 * psize, 0), - SyscallSucceedsWithValue(psize)); - } - EXPECT_EQ(memcmp(buf, received, 3 * psize), 0); -} - -TEST_P(UdpSocketTest, BoundaryPreserved_WritevReadv) { - // Bind s_ to loopback:TestPort. - ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); - - // Direct writes from t_ to s_. - ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds()); - - // Send 2 packets from t_ to s_, where each packet's data consists of 2 - // discontiguous iovecs. - constexpr size_t kPieceSize = 100; - char buf[4 * kPieceSize]; - RandomizeBuffer(buf, sizeof(buf)); - - for (int i = 0; i < 2; i++) { - struct iovec iov[2]; - for (int j = 0; j < 2; j++) { - iov[j].iov_base = reinterpret_cast<void*>( - reinterpret_cast<uintptr_t>(buf) + (i + 2 * j) * kPieceSize); - iov[j].iov_len = kPieceSize; - } - ASSERT_THAT(writev(t_, iov, 2), SyscallSucceedsWithValue(2 * kPieceSize)); - } - - // Receive the data as 2 separate packets. - char received[6 * kPieceSize]; - for (int i = 0; i < 2; i++) { - struct iovec iov[3]; - for (int j = 0; j < 3; j++) { - iov[j].iov_base = reinterpret_cast<void*>( - reinterpret_cast<uintptr_t>(received) + (i + 2 * j) * kPieceSize); - iov[j].iov_len = kPieceSize; - } - ASSERT_THAT(readv(s_, iov, 3), SyscallSucceedsWithValue(2 * kPieceSize)); - } - EXPECT_EQ(memcmp(buf, received, 4 * kPieceSize), 0); -} - -TEST_P(UdpSocketTest, BoundaryPreserved_SendMsgRecvMsg) { - // Bind s_ to loopback:TestPort. - ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); - - // Send 2 packets from t_ to s_, where each packet's data consists of 2 - // discontiguous iovecs. - constexpr size_t kPieceSize = 100; - char buf[4 * kPieceSize]; - RandomizeBuffer(buf, sizeof(buf)); - - for (int i = 0; i < 2; i++) { - struct iovec iov[2]; - for (int j = 0; j < 2; j++) { - iov[j].iov_base = reinterpret_cast<void*>( - reinterpret_cast<uintptr_t>(buf) + (i + 2 * j) * kPieceSize); - iov[j].iov_len = kPieceSize; - } - struct msghdr msg = {}; - msg.msg_name = addr_[0]; - msg.msg_namelen = addrlen_; - msg.msg_iov = iov; - msg.msg_iovlen = 2; - ASSERT_THAT(sendmsg(t_, &msg, 0), SyscallSucceedsWithValue(2 * kPieceSize)); - } - - // Receive the data as 2 separate packets. - char received[6 * kPieceSize]; - for (int i = 0; i < 2; i++) { - struct iovec iov[3]; - for (int j = 0; j < 3; j++) { - iov[j].iov_base = reinterpret_cast<void*>( - reinterpret_cast<uintptr_t>(received) + (i + 2 * j) * kPieceSize); - iov[j].iov_len = kPieceSize; - } - struct msghdr msg = {}; - msg.msg_iov = iov; - msg.msg_iovlen = 3; - ASSERT_THAT(recvmsg(s_, &msg, 0), SyscallSucceedsWithValue(2 * kPieceSize)); - } - EXPECT_EQ(memcmp(buf, received, 4 * kPieceSize), 0); -} - -TEST_P(UdpSocketTest, FIONREADShutdown) { - int n = -1; - EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); - EXPECT_EQ(n, 0); - - // A UDP socket must be connected before it can be shutdown. - ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); - - n = -1; - EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); - EXPECT_EQ(n, 0); - - EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds()); - - n = -1; - EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); - EXPECT_EQ(n, 0); -} - -TEST_P(UdpSocketTest, FIONREADWriteShutdown) { - int n = -1; - EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); - EXPECT_EQ(n, 0); - - // Bind s_ to loopback:TestPort. - ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); - - // A UDP socket must be connected before it can be shutdown. - ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); - - n = -1; - EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); - EXPECT_EQ(n, 0); - - const char str[] = "abc"; - ASSERT_THAT(send(s_, str, sizeof(str), 0), - SyscallSucceedsWithValue(sizeof(str))); - - n = -1; - EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); - EXPECT_EQ(n, sizeof(str)); - - EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds()); - - n = -1; - EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); - EXPECT_EQ(n, sizeof(str)); -} - -TEST_P(UdpSocketTest, FIONREAD) { - // Bind s_ to loopback:TestPort. - ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); - - // Check that the bound socket with an empty buffer reports an empty first - // packet. - int n = -1; - EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); - EXPECT_EQ(n, 0); - - // Send 3 packets from t_ to s_. - constexpr int psize = 100; - char buf[3 * psize]; - RandomizeBuffer(buf, sizeof(buf)); - - for (int i = 0; i < 3; ++i) { - ASSERT_THAT(sendto(t_, buf + i * psize, psize, 0, addr_[0], addrlen_), - SyscallSucceedsWithValue(psize)); - - // Check that regardless of how many packets are in the queue, the size - // reported is that of a single packet. - n = -1; - EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); - EXPECT_EQ(n, psize); - } -} - -TEST_P(UdpSocketTest, FIONREADZeroLengthPacket) { - // Bind s_ to loopback:TestPort. - ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); - - // Check that the bound socket with an empty buffer reports an empty first - // packet. - int n = -1; - EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); - EXPECT_EQ(n, 0); - - // Send 3 packets from t_ to s_. - constexpr int psize = 100; - char buf[3 * psize]; - RandomizeBuffer(buf, sizeof(buf)); - - for (int i = 0; i < 3; ++i) { - ASSERT_THAT(sendto(t_, buf + i * psize, 0, 0, addr_[0], addrlen_), - SyscallSucceedsWithValue(0)); - - // Check that regardless of how many packets are in the queue, the size - // reported is that of a single packet. - n = -1; - EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); - EXPECT_EQ(n, 0); - } -} - -TEST_P(UdpSocketTest, FIONREADZeroLengthWriteShutdown) { - int n = -1; - EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); - EXPECT_EQ(n, 0); - - // Bind s_ to loopback:TestPort. - ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); - - // A UDP socket must be connected before it can be shutdown. - ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); - - n = -1; - EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); - EXPECT_EQ(n, 0); - - const char str[] = "abc"; - ASSERT_THAT(send(s_, str, 0, 0), SyscallSucceedsWithValue(0)); - - n = -1; - EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); - EXPECT_EQ(n, 0); - - EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds()); - - n = -1; - EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); - EXPECT_EQ(n, 0); -} - -TEST_P(UdpSocketTest, ErrorQueue) { - char cmsgbuf[CMSG_SPACE(sizeof(sock_extended_err))]; - msghdr msg; - memset(&msg, 0, sizeof(msg)); - iovec iov; - memset(&iov, 0, sizeof(iov)); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - msg.msg_control = cmsgbuf; - msg.msg_controllen = sizeof(cmsgbuf); - - // recv*(MSG_ERRQUEUE) never blocks, even without MSG_DONTWAIT. - EXPECT_THAT(RetryEINTR(recvmsg)(s_, &msg, MSG_ERRQUEUE), - SyscallFailsWithErrno(EAGAIN)); -} - -TEST_P(UdpSocketTest, SoTimestampOffByDefault) { - int v = -1; - socklen_t optlen = sizeof(v); - ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, &optlen), - SyscallSucceeds()); - ASSERT_EQ(v, kSockOptOff); - ASSERT_EQ(optlen, sizeof(v)); -} - -TEST_P(UdpSocketTest, SoTimestamp) { - ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); - ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds()); - - int v = 1; - ASSERT_THAT(setsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)), - SyscallSucceeds()); - - char buf[3]; - // Send zero length packet from t_ to s_. - ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0)); - - char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))]; - msghdr msg; - memset(&msg, 0, sizeof(msg)); - iovec iov; - memset(&iov, 0, sizeof(iov)); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - msg.msg_control = cmsgbuf; - msg.msg_controllen = sizeof(cmsgbuf); - - ASSERT_THAT(RetryEINTR(recvmsg)(s_, &msg, 0), SyscallSucceedsWithValue(0)); - - struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); - ASSERT_NE(cmsg, nullptr); - ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET); - ASSERT_EQ(cmsg->cmsg_type, SO_TIMESTAMP); - ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(struct timeval))); - - struct timeval tv = {}; - memcpy(&tv, CMSG_DATA(cmsg), sizeof(struct timeval)); - - ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0); - - // There should be nothing to get via ioctl. - ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallFailsWithErrno(ENOENT)); -} - -TEST_P(UdpSocketTest, WriteShutdownNotConnected) { - EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN)); -} - -TEST_P(UdpSocketTest, TimestampIoctl) { - ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); - ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds()); - - char buf[3]; - // Send packet from t_ to s_. - ASSERT_THAT(RetryEINTR(write)(t_, buf, sizeof(buf)), - SyscallSucceedsWithValue(sizeof(buf))); - - // There should be no control messages. - char recv_buf[sizeof(buf)]; - ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf))); - - // A nonzero timeval should be available via ioctl. - struct timeval tv = {}; - ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallSucceeds()); - ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0); -} - -TEST_P(UdpSocketTest, TimetstampIoctlNothingRead) { - ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); - ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds()); - - struct timeval tv = {}; - ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallFailsWithErrno(ENOENT)); -} - -// Test that the timestamp accessed via SIOCGSTAMP is still accessible after -// SO_TIMESTAMP is enabled and used to retrieve a timestamp. -TEST_P(UdpSocketTest, TimestampIoctlPersistence) { - ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); - ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds()); - - char buf[3]; - // Send packet from t_ to s_. - ASSERT_THAT(RetryEINTR(write)(t_, buf, sizeof(buf)), - SyscallSucceedsWithValue(sizeof(buf))); - ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0)); - - // There should be no control messages. - char recv_buf[sizeof(buf)]; - ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf))); - - // A nonzero timeval should be available via ioctl. - struct timeval tv = {}; - ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallSucceeds()); - ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0); - - // Enable SO_TIMESTAMP and send a message. - int v = 1; - EXPECT_THAT(setsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)), - SyscallSucceeds()); - ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0)); - - // There should be a message for SO_TIMESTAMP. - char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))]; - msghdr msg = {}; - iovec iov = {}; - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - msg.msg_control = cmsgbuf; - msg.msg_controllen = sizeof(cmsgbuf); - ASSERT_THAT(RetryEINTR(recvmsg)(s_, &msg, 0), SyscallSucceedsWithValue(0)); - struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); - cmsg = CMSG_FIRSTHDR(&msg); - ASSERT_NE(cmsg, nullptr); - - // The ioctl should return the exact same values as before. - struct timeval tv2 = {}; - ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv2), SyscallSucceeds()); - ASSERT_EQ(tv.tv_sec, tv2.tv_sec); - ASSERT_EQ(tv.tv_usec, tv2.tv_usec); -} - INSTANTIATE_TEST_SUITE_P(AllInetTests, UdpSocketTest, ::testing::Values(AddressFamily::kIpv4, AddressFamily::kIpv6, diff --git a/test/syscalls/linux/udp_socket_errqueue_test_case.cc b/test/syscalls/linux/udp_socket_errqueue_test_case.cc new file mode 100644 index 000000000..147978f46 --- /dev/null +++ b/test/syscalls/linux/udp_socket_errqueue_test_case.cc @@ -0,0 +1,54 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/syscalls/linux/udp_socket_test_cases.h" + +#include <arpa/inet.h> +#include <fcntl.h> +#include <linux/errqueue.h> +#include <netinet/in.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/types.h> + +#include "gtest/gtest.h" +#include "absl/base/macros.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +TEST_P(UdpSocketTest, ErrorQueue) { + char cmsgbuf[CMSG_SPACE(sizeof(sock_extended_err))]; + msghdr msg; + memset(&msg, 0, sizeof(msg)); + iovec iov; + memset(&iov, 0, sizeof(iov)); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsgbuf; + msg.msg_controllen = sizeof(cmsgbuf); + + // recv*(MSG_ERRQUEUE) never blocks, even without MSG_DONTWAIT. + EXPECT_THAT(RetryEINTR(recvmsg)(s_, &msg, MSG_ERRQUEUE), + SyscallFailsWithErrno(EAGAIN)); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc new file mode 100644 index 000000000..dc35c2f50 --- /dev/null +++ b/test/syscalls/linux/udp_socket_test_cases.cc @@ -0,0 +1,1498 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/syscalls/linux/udp_socket_test_cases.h" + +#include <arpa/inet.h> +#include <fcntl.h> +#include <netinet/in.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/types.h> + +#include "gtest/gtest.h" +#include "absl/base/macros.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +// Gets a pointer to the port component of the given address. +uint16_t* Port(struct sockaddr_storage* addr) { + switch (addr->ss_family) { + case AF_INET: { + auto sin = reinterpret_cast<struct sockaddr_in*>(addr); + return &sin->sin_port; + } + case AF_INET6: { + auto sin6 = reinterpret_cast<struct sockaddr_in6*>(addr); + return &sin6->sin6_port; + } + } + + return nullptr; +} + +void UdpSocketTest::SetUp() { + int type; + if (GetParam() == AddressFamily::kIpv4) { + type = AF_INET; + auto sin = reinterpret_cast<struct sockaddr_in*>(&anyaddr_storage_); + addrlen_ = sizeof(*sin); + sin->sin_addr.s_addr = htonl(INADDR_ANY); + } else { + type = AF_INET6; + auto sin6 = reinterpret_cast<struct sockaddr_in6*>(&anyaddr_storage_); + addrlen_ = sizeof(*sin6); + if (GetParam() == AddressFamily::kIpv6) { + sin6->sin6_addr = IN6ADDR_ANY_INIT; + } else { + TestAddress const& v4_mapped_any = V4MappedAny(); + sin6->sin6_addr = + reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr) + ->sin6_addr; + } + } + ASSERT_THAT(s_ = socket(type, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds()); + + ASSERT_THAT(t_ = socket(type, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds()); + + memset(&anyaddr_storage_, 0, sizeof(anyaddr_storage_)); + anyaddr_ = reinterpret_cast<struct sockaddr*>(&anyaddr_storage_); + anyaddr_->sa_family = type; + + if (gvisor::testing::IsRunningOnGvisor()) { + for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) { + ports_[i] = TestPort + i; + } + } else { + // When not under gvisor, use utility function to pick port. Assert that + // all ports are different. + std::string error; + for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) { + // Find an unused port, we specify port 0 to allow the kernel to provide + // the port. + bool unique = true; + do { + ports_[i] = ASSERT_NO_ERRNO_AND_VALUE(PortAvailable( + 0, AddressFamily::kDualStack, SocketType::kUdp, false)); + ASSERT_GT(ports_[i], 0); + for (size_t j = 0; j < i; ++j) { + if (ports_[j] == ports_[i]) { + unique = false; + break; + } + } + } while (!unique); + } + } + + // Initialize the sockaddrs. + for (size_t i = 0; i < ABSL_ARRAYSIZE(addr_); ++i) { + memset(&addr_storage_[i], 0, sizeof(addr_storage_[i])); + + addr_[i] = reinterpret_cast<struct sockaddr*>(&addr_storage_[i]); + addr_[i]->sa_family = type; + + switch (type) { + case AF_INET: { + auto sin = reinterpret_cast<struct sockaddr_in*>(addr_[i]); + sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + sin->sin_port = htons(ports_[i]); + break; + } + case AF_INET6: { + auto sin6 = reinterpret_cast<struct sockaddr_in6*>(addr_[i]); + sin6->sin6_addr = in6addr_loopback; + sin6->sin6_port = htons(ports_[i]); + break; + } + } + } +} + +TEST_P(UdpSocketTest, Creation) { + int type = AF_INET6; + if (GetParam() == AddressFamily::kIpv4) { + type = AF_INET; + } + + int s_; + + ASSERT_THAT(s_ = socket(type, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds()); + EXPECT_THAT(close(s_), SyscallSucceeds()); + + ASSERT_THAT(s_ = socket(type, SOCK_DGRAM, 0), SyscallSucceeds()); + EXPECT_THAT(close(s_), SyscallSucceeds()); + + ASSERT_THAT(s_ = socket(type, SOCK_STREAM, IPPROTO_UDP), SyscallFails()); +} + +TEST_P(UdpSocketTest, Getsockname) { + // Check that we're not bound. + struct sockaddr_storage addr; + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), + SyscallSucceeds()); + EXPECT_EQ(addrlen, addrlen_); + EXPECT_EQ(memcmp(&addr, anyaddr_, addrlen_), 0); + + // Bind, then check that we get the right address. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + + addrlen = sizeof(addr); + EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), + SyscallSucceeds()); + EXPECT_EQ(addrlen, addrlen_); + EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0); +} + +TEST_P(UdpSocketTest, Getpeername) { + // Check that we're not connected. + struct sockaddr_storage addr; + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), + SyscallFailsWithErrno(ENOTCONN)); + + // Connect, then check that we get the right address. + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + + addrlen = sizeof(addr); + EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), + SyscallSucceeds()); + EXPECT_EQ(addrlen, addrlen_); + EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0); +} + +TEST_P(UdpSocketTest, SendNotConnected) { + // Do send & write, they must fail. + char buf[512]; + EXPECT_THAT(send(s_, buf, sizeof(buf), 0), + SyscallFailsWithErrno(EDESTADDRREQ)); + + EXPECT_THAT(write(s_, buf, sizeof(buf)), SyscallFailsWithErrno(EDESTADDRREQ)); + + // Use sendto. + ASSERT_THAT(sendto(s_, buf, sizeof(buf), 0, addr_[0], addrlen_), + SyscallSucceedsWithValue(sizeof(buf))); + + // Check that we're bound now. + struct sockaddr_storage addr; + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), + SyscallSucceeds()); + EXPECT_EQ(addrlen, addrlen_); + EXPECT_NE(*Port(&addr), 0); +} + +TEST_P(UdpSocketTest, ConnectBinds) { + // Connect the socket. + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Check that we're bound now. + struct sockaddr_storage addr; + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), + SyscallSucceeds()); + EXPECT_EQ(addrlen, addrlen_); + EXPECT_NE(*Port(&addr), 0); +} + +TEST_P(UdpSocketTest, ReceiveNotBound) { + char buf[512]; + EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); +} + +TEST_P(UdpSocketTest, Bind) { + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Try to bind again. + EXPECT_THAT(bind(s_, addr_[1], addrlen_), SyscallFailsWithErrno(EINVAL)); + + // Check that we're still bound to the original address. + struct sockaddr_storage addr; + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), + SyscallSucceeds()); + EXPECT_EQ(addrlen, addrlen_); + EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0); +} + +TEST_P(UdpSocketTest, BindInUse) { + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Try to bind again. + EXPECT_THAT(bind(t_, addr_[0], addrlen_), SyscallFailsWithErrno(EADDRINUSE)); +} + +TEST_P(UdpSocketTest, ReceiveAfterConnect) { + // Connect s_ to loopback:TestPort, and bind t_ to loopback:TestPort. + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(bind(t_, addr_[0], addrlen_), SyscallSucceeds()); + + // Get the address s_ was bound to during connect. + struct sockaddr_storage addr; + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), + SyscallSucceeds()); + EXPECT_EQ(addrlen, addrlen_); + + // Send from t_ to s_. + char buf[512]; + RandomizeBuffer(buf, sizeof(buf)); + ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, + reinterpret_cast<sockaddr*>(&addr), addrlen), + SyscallSucceedsWithValue(sizeof(buf))); + + // Receive the data. + char received[sizeof(buf)]; + EXPECT_THAT(recv(s_, received, sizeof(received), 0), + SyscallSucceedsWithValue(sizeof(received))); + EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); +} + +TEST_P(UdpSocketTest, ReceiveAfterDisconnect) { + // Connect s_ to loopback:TestPort, and bind t_ to loopback:TestPort. + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(bind(t_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(t_, addr_[1], addrlen_), SyscallSucceeds()); + + // Get the address s_ was bound to during connect. + struct sockaddr_storage addr; + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), + SyscallSucceeds()); + EXPECT_EQ(addrlen, addrlen_); + + for (int i = 0; i < 2; i++) { + // Send from t_ to s_. + char buf[512]; + RandomizeBuffer(buf, sizeof(buf)); + EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), + SyscallSucceeds()); + ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, + reinterpret_cast<sockaddr*>(&addr), addrlen), + SyscallSucceedsWithValue(sizeof(buf))); + + // Receive the data. + char received[sizeof(buf)]; + EXPECT_THAT(recv(s_, received, sizeof(received), 0), + SyscallSucceedsWithValue(sizeof(received))); + EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); + + // Disconnect s_. + struct sockaddr addr = {}; + addr.sa_family = AF_UNSPEC; + ASSERT_THAT(connect(s_, &addr, sizeof(addr.sa_family)), SyscallSucceeds()); + // Connect s_ loopback:TestPort. + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + } +} + +TEST_P(UdpSocketTest, Connect) { + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Check that we're connected to the right peer. + struct sockaddr_storage peer; + socklen_t peerlen = sizeof(peer); + EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen), + SyscallSucceeds()); + EXPECT_EQ(peerlen, addrlen_); + EXPECT_EQ(memcmp(&peer, addr_[0], addrlen_), 0); + + // Try to bind after connect. + EXPECT_THAT(bind(s_, addr_[1], addrlen_), SyscallFailsWithErrno(EINVAL)); + + // Try to connect again. + EXPECT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds()); + + // Check that peer name changed. + peerlen = sizeof(peer); + EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen), + SyscallSucceeds()); + EXPECT_EQ(peerlen, addrlen_); + EXPECT_EQ(memcmp(&peer, addr_[2], addrlen_), 0); +} + +void ConnectAny(AddressFamily family, int sockfd, uint16_t port) { + struct sockaddr_storage addr = {}; + + // Precondition check. + { + socklen_t addrlen = sizeof(addr); + EXPECT_THAT( + getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen), + SyscallSucceeds()); + + if (family == AddressFamily::kIpv4) { + auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr); + EXPECT_EQ(addrlen, sizeof(*addr_out)); + EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_ANY)); + } else { + auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr); + EXPECT_EQ(addrlen, sizeof(*addr_out)); + struct in6_addr any = IN6ADDR_ANY_INIT; + EXPECT_EQ(memcmp(&addr_out->sin6_addr, &any, sizeof(in6_addr)), 0); + } + + { + socklen_t addrlen = sizeof(addr); + EXPECT_THAT( + getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen), + SyscallFailsWithErrno(ENOTCONN)); + } + + struct sockaddr_storage baddr = {}; + if (family == AddressFamily::kIpv4) { + auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr); + addrlen = sizeof(*addr_in); + addr_in->sin_family = AF_INET; + addr_in->sin_addr.s_addr = htonl(INADDR_ANY); + addr_in->sin_port = port; + } else { + auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr); + addrlen = sizeof(*addr_in); + addr_in->sin6_family = AF_INET6; + addr_in->sin6_port = port; + if (family == AddressFamily::kIpv6) { + addr_in->sin6_addr = IN6ADDR_ANY_INIT; + } else { + TestAddress const& v4_mapped_any = V4MappedAny(); + addr_in->sin6_addr = + reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr) + ->sin6_addr; + } + } + + // TODO(b/138658473): gVisor doesn't allow connecting to the zero port. + if (port == 0) { + SKIP_IF(IsRunningOnGvisor()); + } + + ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&baddr), addrlen), + SyscallSucceeds()); + } + + // Postcondition check. + { + socklen_t addrlen = sizeof(addr); + EXPECT_THAT( + getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen), + SyscallSucceeds()); + + if (family == AddressFamily::kIpv4) { + auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr); + EXPECT_EQ(addrlen, sizeof(*addr_out)); + EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_LOOPBACK)); + } else { + auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr); + EXPECT_EQ(addrlen, sizeof(*addr_out)); + struct in6_addr loopback; + if (family == AddressFamily::kIpv6) { + loopback = IN6ADDR_LOOPBACK_INIT; + } else { + TestAddress const& v4_mapped_loopback = V4MappedLoopback(); + loopback = reinterpret_cast<const struct sockaddr_in6*>( + &v4_mapped_loopback.addr) + ->sin6_addr; + } + + EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0); + } + + addrlen = sizeof(addr); + if (port == 0) { + EXPECT_THAT( + getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen), + SyscallFailsWithErrno(ENOTCONN)); + } else { + EXPECT_THAT( + getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen), + SyscallSucceeds()); + } + } +} + +TEST_P(UdpSocketTest, ConnectAny) { ConnectAny(GetParam(), s_, 0); } + +TEST_P(UdpSocketTest, ConnectAnyWithPort) { + auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1])); + ConnectAny(GetParam(), s_, port); +} + +void DisconnectAfterConnectAny(AddressFamily family, int sockfd, int port) { + struct sockaddr_storage addr = {}; + + socklen_t addrlen = sizeof(addr); + struct sockaddr_storage baddr = {}; + if (family == AddressFamily::kIpv4) { + auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr); + addrlen = sizeof(*addr_in); + addr_in->sin_family = AF_INET; + addr_in->sin_addr.s_addr = htonl(INADDR_ANY); + addr_in->sin_port = port; + } else { + auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr); + addrlen = sizeof(*addr_in); + addr_in->sin6_family = AF_INET6; + addr_in->sin6_port = port; + if (family == AddressFamily::kIpv6) { + addr_in->sin6_addr = IN6ADDR_ANY_INIT; + } else { + TestAddress const& v4_mapped_any = V4MappedAny(); + addr_in->sin6_addr = + reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr) + ->sin6_addr; + } + } + + // TODO(b/138658473): gVisor doesn't allow connecting to the zero port. + if (port == 0) { + SKIP_IF(IsRunningOnGvisor()); + } + + ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&baddr), addrlen), + SyscallSucceeds()); + // Now the socket is bound to the loopback address. + + // Disconnect + addrlen = sizeof(addr); + addr.ss_family = AF_UNSPEC; + ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&addr), addrlen), + SyscallSucceeds()); + + // Check that after disconnect the socket is bound to the ANY address. + EXPECT_THAT(getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen), + SyscallSucceeds()); + if (family == AddressFamily::kIpv4) { + auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr); + EXPECT_EQ(addrlen, sizeof(*addr_out)); + EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_ANY)); + } else { + auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr); + EXPECT_EQ(addrlen, sizeof(*addr_out)); + struct in6_addr loopback = IN6ADDR_ANY_INIT; + + EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0); + } +} + +TEST_P(UdpSocketTest, DisconnectAfterConnectAny) { + DisconnectAfterConnectAny(GetParam(), s_, 0); +} + +TEST_P(UdpSocketTest, DisconnectAfterConnectAnyWithPort) { + auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1])); + DisconnectAfterConnectAny(GetParam(), s_, port); +} + +TEST_P(UdpSocketTest, DisconnectAfterBind) { + ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds()); + // Connect the socket. + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + + struct sockaddr_storage addr = {}; + addr.ss_family = AF_UNSPEC; + EXPECT_THAT( + connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)), + SyscallSucceeds()); + + // Check that we're still bound. + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), + SyscallSucceeds()); + + EXPECT_EQ(addrlen, addrlen_); + EXPECT_EQ(memcmp(&addr, addr_[1], addrlen_), 0); + + addrlen = sizeof(addr); + EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), + SyscallFailsWithErrno(ENOTCONN)); +} + +TEST_P(UdpSocketTest, BindToAnyConnnectToLocalhost) { + struct sockaddr_storage baddr = {}; + auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1])); + if (GetParam() == AddressFamily::kIpv4) { + auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr); + addr_in->sin_family = AF_INET; + addr_in->sin_port = port; + addr_in->sin_addr.s_addr = htonl(INADDR_ANY); + } else { + auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr); + addr_in->sin6_family = AF_INET6; + addr_in->sin6_port = port; + addr_in->sin6_scope_id = 0; + addr_in->sin6_addr = IN6ADDR_ANY_INIT; + } + ASSERT_THAT(bind(s_, reinterpret_cast<sockaddr*>(&baddr), addrlen_), + SyscallSucceeds()); + // Connect the socket. + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + + struct sockaddr_storage addr = {}; + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), + SyscallSucceeds()); + + // If the socket is bound to ANY and connected to a loopback address, + // getsockname() has to return the loopback address. + if (GetParam() == AddressFamily::kIpv4) { + auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr); + EXPECT_EQ(addrlen, sizeof(*addr_out)); + EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_LOOPBACK)); + } else { + auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr); + struct in6_addr loopback = IN6ADDR_LOOPBACK_INIT; + EXPECT_EQ(addrlen, sizeof(*addr_out)); + EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0); + } +} + +TEST_P(UdpSocketTest, DisconnectAfterBindToAny) { + struct sockaddr_storage baddr = {}; + socklen_t addrlen; + auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1])); + if (GetParam() == AddressFamily::kIpv4) { + auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr); + addr_in->sin_family = AF_INET; + addr_in->sin_port = port; + addr_in->sin_addr.s_addr = htonl(INADDR_ANY); + } else { + auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr); + addr_in->sin6_family = AF_INET6; + addr_in->sin6_port = port; + addr_in->sin6_scope_id = 0; + addr_in->sin6_addr = IN6ADDR_ANY_INIT; + } + ASSERT_THAT(bind(s_, reinterpret_cast<sockaddr*>(&baddr), addrlen_), + SyscallSucceeds()); + // Connect the socket. + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + + struct sockaddr_storage addr = {}; + addr.ss_family = AF_UNSPEC; + EXPECT_THAT( + connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)), + SyscallSucceeds()); + + // Check that we're still bound. + addrlen = sizeof(addr); + EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), + SyscallSucceeds()); + + EXPECT_EQ(addrlen, addrlen_); + EXPECT_EQ(memcmp(&addr, &baddr, addrlen), 0); + + addrlen = sizeof(addr); + EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), + SyscallFailsWithErrno(ENOTCONN)); +} + +TEST_P(UdpSocketTest, Disconnect) { + for (int i = 0; i < 2; i++) { + // Try to connect again. + EXPECT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds()); + + // Check that we're connected to the right peer. + struct sockaddr_storage peer; + socklen_t peerlen = sizeof(peer); + EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen), + SyscallSucceeds()); + EXPECT_EQ(peerlen, addrlen_); + EXPECT_EQ(memcmp(&peer, addr_[2], addrlen_), 0); + + // Try to disconnect. + struct sockaddr_storage addr = {}; + addr.ss_family = AF_UNSPEC; + EXPECT_THAT( + connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)), + SyscallSucceeds()); + + peerlen = sizeof(peer); + EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen), + SyscallFailsWithErrno(ENOTCONN)); + + // Check that we're still bound. + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen), + SyscallSucceeds()); + EXPECT_EQ(addrlen, addrlen_); + EXPECT_EQ(*Port(&addr), 0); + } +} + +TEST_P(UdpSocketTest, ConnectBadAddress) { + struct sockaddr addr = {}; + addr.sa_family = addr_[0]->sa_family; + ASSERT_THAT(connect(s_, &addr, sizeof(addr.sa_family)), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_P(UdpSocketTest, SendToAddressOtherThanConnected) { + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Send to a different destination than we're connected to. + char buf[512]; + EXPECT_THAT(sendto(s_, buf, sizeof(buf), 0, addr_[1], addrlen_), + SyscallSucceedsWithValue(sizeof(buf))); +} + +TEST_P(UdpSocketTest, ZerolengthWriteAllowed) { + // TODO(gvisor.dev/issue/1202): Hostinet does not support zero length writes. + SKIP_IF(IsRunningWithHostinet()); + + // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds()); + + // Bind t_ to loopback:TestPort+1. + ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds()); + + char buf[3]; + // Send zero length packet from s_ to t_. + ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0)); + // Receive the packet. + char received[3]; + EXPECT_THAT(read(t_, received, sizeof(received)), + SyscallSucceedsWithValue(0)); +} + +TEST_P(UdpSocketTest, ZerolengthWriteAllowedNonBlockRead) { + // TODO(gvisor.dev/issue/1202): Hostinet does not support zero length writes. + SKIP_IF(IsRunningWithHostinet()); + + // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds()); + + // Bind t_ to loopback:TestPort+1. + ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds()); + + // Set t_ to non-blocking. + int opts = 0; + ASSERT_THAT(opts = fcntl(t_, F_GETFL), SyscallSucceeds()); + ASSERT_THAT(fcntl(t_, F_SETFL, opts | O_NONBLOCK), SyscallSucceeds()); + + char buf[3]; + // Send zero length packet from s_ to t_. + ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0)); + // Receive the packet. + char received[3]; + EXPECT_THAT(read(t_, received, sizeof(received)), + SyscallSucceedsWithValue(0)); + EXPECT_THAT(read(t_, received, sizeof(received)), + SyscallFailsWithErrno(EAGAIN)); +} + +TEST_P(UdpSocketTest, SendAndReceiveNotConnected) { + // Bind s_ to loopback. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Send some data to s_. + char buf[512]; + RandomizeBuffer(buf, sizeof(buf)); + + ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_), + SyscallSucceedsWithValue(sizeof(buf))); + + // Receive the data. + char received[sizeof(buf)]; + EXPECT_THAT(recv(s_, received, sizeof(received), 0), + SyscallSucceedsWithValue(sizeof(received))); + EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); +} + +TEST_P(UdpSocketTest, SendAndReceiveConnected) { + // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds()); + + // Bind t_ to loopback:TestPort+1. + ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds()); + + // Send some data from t_ to s_. + char buf[512]; + RandomizeBuffer(buf, sizeof(buf)); + + ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_), + SyscallSucceedsWithValue(sizeof(buf))); + + // Receive the data. + char received[sizeof(buf)]; + EXPECT_THAT(recv(s_, received, sizeof(received), 0), + SyscallSucceedsWithValue(sizeof(received))); + EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); +} + +TEST_P(UdpSocketTest, ReceiveFromNotConnected) { + // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds()); + + // Bind t_ to loopback:TestPort+2. + ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds()); + + // Send some data from t_ to s_. + char buf[512]; + ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_), + SyscallSucceedsWithValue(sizeof(buf))); + + // Check that the data isn't_ received because it was sent from a different + // address than we're connected. + EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); +} + +TEST_P(UdpSocketTest, ReceiveBeforeConnect) { + // Bind s_ to loopback:TestPort. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Bind t_ to loopback:TestPort+2. + ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds()); + + // Send some data from t_ to s_. + char buf[512]; + RandomizeBuffer(buf, sizeof(buf)); + + ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_), + SyscallSucceedsWithValue(sizeof(buf))); + + // Connect to loopback:TestPort+1. + ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds()); + + // Receive the data. It works because it was sent before the connect. + char received[sizeof(buf)]; + EXPECT_THAT(recv(s_, received, sizeof(received), 0), + SyscallSucceedsWithValue(sizeof(received))); + EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); + + // Send again. This time it should not be received. + ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_), + SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); +} + +TEST_P(UdpSocketTest, ReceiveFrom) { + // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds()); + + // Bind t_ to loopback:TestPort+1. + ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds()); + + // Send some data from t_ to s_. + char buf[512]; + RandomizeBuffer(buf, sizeof(buf)); + + ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_), + SyscallSucceedsWithValue(sizeof(buf))); + + // Receive the data and sender address. + char received[sizeof(buf)]; + struct sockaddr_storage addr; + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(recvfrom(s_, received, sizeof(received), 0, + reinterpret_cast<sockaddr*>(&addr), &addrlen), + SyscallSucceedsWithValue(sizeof(received))); + EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); + EXPECT_EQ(addrlen, addrlen_); + EXPECT_EQ(memcmp(&addr, addr_[1], addrlen_), 0); +} + +TEST_P(UdpSocketTest, Listen) { + ASSERT_THAT(listen(s_, SOMAXCONN), SyscallFailsWithErrno(EOPNOTSUPP)); +} + +TEST_P(UdpSocketTest, Accept) { + ASSERT_THAT(accept(s_, nullptr, nullptr), SyscallFailsWithErrno(EOPNOTSUPP)); +} + +// This test validates that a read shutdown with pending data allows the read +// to proceed with the data before returning EAGAIN. +TEST_P(UdpSocketTest, ReadShutdownNonblockPendingData) { + char received[512]; + + // Bind t_ to loopback:TestPort+2. + ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(t_, addr_[1], addrlen_), SyscallSucceeds()); + + // Connect the socket, then try to shutdown again. + ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds()); + + // Verify that we get EWOULDBLOCK when there is nothing to read. + EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); + + const char* buf = "abc"; + EXPECT_THAT(write(t_, buf, 3), SyscallSucceedsWithValue(3)); + + int opts = 0; + ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds()); + ASSERT_THAT(fcntl(s_, F_SETFL, opts | O_NONBLOCK), SyscallSucceeds()); + ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds()); + ASSERT_NE(opts & O_NONBLOCK, 0); + + EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds()); + + // We should get the data even though read has been shutdown. + EXPECT_THAT(recv(s_, received, 2, 0), SyscallSucceedsWithValue(2)); + + // Because we read less than the entire packet length, since it's a packet + // based socket any subsequent reads should return EWOULDBLOCK. + EXPECT_THAT(recv(s_, received, 1, 0), SyscallFailsWithErrno(EWOULDBLOCK)); +} + +// This test is validating that even after a socket is shutdown if it's +// reconnected it will reset the shutdown state. +TEST_P(UdpSocketTest, ReadShutdownSameSocketResetsShutdownState) { + char received[512]; + EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); + + EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallFailsWithErrno(ENOTCONN)); + + EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); + + // Connect the socket, then try to shutdown again. + ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds()); + + EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); +} + +TEST_P(UdpSocketTest, ReadShutdown) { + // TODO(gvisor.dev/issue/1202): Calling recv() after shutdown without + // MSG_DONTWAIT blocks indefinitely. + SKIP_IF(IsRunningWithHostinet()); + + char received[512]; + EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); + + EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallFailsWithErrno(ENOTCONN)); + + EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); + + // Connect the socket, then try to shutdown again. + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + + EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); + + EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds()); + + EXPECT_THAT(recv(s_, received, sizeof(received), 0), + SyscallSucceedsWithValue(0)); +} + +TEST_P(UdpSocketTest, ReadShutdownDifferentThread) { + // TODO(gvisor.dev/issue/1202): Calling recv() after shutdown without + // MSG_DONTWAIT blocks indefinitely. + SKIP_IF(IsRunningWithHostinet()); + + char received[512]; + EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); + + // Connect the socket, then shutdown from another thread. + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + + EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); + + ScopedThread t([&] { + absl::SleepFor(absl::Milliseconds(200)); + EXPECT_THAT(shutdown(this->s_, SHUT_RD), SyscallSucceeds()); + }); + EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0), + SyscallSucceedsWithValue(0)); + t.Join(); + + EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0), + SyscallSucceedsWithValue(0)); +} + +TEST_P(UdpSocketTest, WriteShutdown) { + EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN)); + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallSucceeds()); +} + +TEST_P(UdpSocketTest, SynchronousReceive) { + // Bind s_ to loopback. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Send some data to s_ from another thread. + char buf[512]; + RandomizeBuffer(buf, sizeof(buf)); + + // Receive the data prior to actually starting the other thread. + char received[512]; + EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), MSG_DONTWAIT), + SyscallFailsWithErrno(EWOULDBLOCK)); + + // Start the thread. + ScopedThread t([&] { + absl::SleepFor(absl::Milliseconds(200)); + ASSERT_THAT( + sendto(this->t_, buf, sizeof(buf), 0, this->addr_[0], this->addrlen_), + SyscallSucceedsWithValue(sizeof(buf))); + }); + + EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0), + SyscallSucceedsWithValue(512)); + EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); +} + +TEST_P(UdpSocketTest, BoundaryPreserved_SendRecv) { + // Bind s_ to loopback:TestPort. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Send 3 packets from t_ to s_. + constexpr int psize = 100; + char buf[3 * psize]; + RandomizeBuffer(buf, sizeof(buf)); + + for (int i = 0; i < 3; ++i) { + ASSERT_THAT(sendto(t_, buf + i * psize, psize, 0, addr_[0], addrlen_), + SyscallSucceedsWithValue(psize)); + } + + // Receive the data as 3 separate packets. + char received[6 * psize]; + for (int i = 0; i < 3; ++i) { + EXPECT_THAT(recv(s_, received + i * psize, 3 * psize, 0), + SyscallSucceedsWithValue(psize)); + } + EXPECT_EQ(memcmp(buf, received, 3 * psize), 0); +} + +TEST_P(UdpSocketTest, BoundaryPreserved_WritevReadv) { + // Bind s_ to loopback:TestPort. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Direct writes from t_ to s_. + ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds()); + + // Send 2 packets from t_ to s_, where each packet's data consists of 2 + // discontiguous iovecs. + constexpr size_t kPieceSize = 100; + char buf[4 * kPieceSize]; + RandomizeBuffer(buf, sizeof(buf)); + + for (int i = 0; i < 2; i++) { + struct iovec iov[2]; + for (int j = 0; j < 2; j++) { + iov[j].iov_base = reinterpret_cast<void*>( + reinterpret_cast<uintptr_t>(buf) + (i + 2 * j) * kPieceSize); + iov[j].iov_len = kPieceSize; + } + ASSERT_THAT(writev(t_, iov, 2), SyscallSucceedsWithValue(2 * kPieceSize)); + } + + // Receive the data as 2 separate packets. + char received[6 * kPieceSize]; + for (int i = 0; i < 2; i++) { + struct iovec iov[3]; + for (int j = 0; j < 3; j++) { + iov[j].iov_base = reinterpret_cast<void*>( + reinterpret_cast<uintptr_t>(received) + (i + 2 * j) * kPieceSize); + iov[j].iov_len = kPieceSize; + } + ASSERT_THAT(readv(s_, iov, 3), SyscallSucceedsWithValue(2 * kPieceSize)); + } + EXPECT_EQ(memcmp(buf, received, 4 * kPieceSize), 0); +} + +TEST_P(UdpSocketTest, BoundaryPreserved_SendMsgRecvMsg) { + // Bind s_ to loopback:TestPort. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Send 2 packets from t_ to s_, where each packet's data consists of 2 + // discontiguous iovecs. + constexpr size_t kPieceSize = 100; + char buf[4 * kPieceSize]; + RandomizeBuffer(buf, sizeof(buf)); + + for (int i = 0; i < 2; i++) { + struct iovec iov[2]; + for (int j = 0; j < 2; j++) { + iov[j].iov_base = reinterpret_cast<void*>( + reinterpret_cast<uintptr_t>(buf) + (i + 2 * j) * kPieceSize); + iov[j].iov_len = kPieceSize; + } + struct msghdr msg = {}; + msg.msg_name = addr_[0]; + msg.msg_namelen = addrlen_; + msg.msg_iov = iov; + msg.msg_iovlen = 2; + ASSERT_THAT(sendmsg(t_, &msg, 0), SyscallSucceedsWithValue(2 * kPieceSize)); + } + + // Receive the data as 2 separate packets. + char received[6 * kPieceSize]; + for (int i = 0; i < 2; i++) { + struct iovec iov[3]; + for (int j = 0; j < 3; j++) { + iov[j].iov_base = reinterpret_cast<void*>( + reinterpret_cast<uintptr_t>(received) + (i + 2 * j) * kPieceSize); + iov[j].iov_len = kPieceSize; + } + struct msghdr msg = {}; + msg.msg_iov = iov; + msg.msg_iovlen = 3; + ASSERT_THAT(recvmsg(s_, &msg, 0), SyscallSucceedsWithValue(2 * kPieceSize)); + } + EXPECT_EQ(memcmp(buf, received, 4 * kPieceSize), 0); +} + +TEST_P(UdpSocketTest, FIONREADShutdown) { + int n = -1; + EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, 0); + + // A UDP socket must be connected before it can be shutdown. + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + + n = -1; + EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, 0); + + EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds()); + + n = -1; + EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, 0); +} + +TEST_P(UdpSocketTest, FIONREADWriteShutdown) { + int n = -1; + EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, 0); + + // Bind s_ to loopback:TestPort. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // A UDP socket must be connected before it can be shutdown. + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + + n = -1; + EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, 0); + + const char str[] = "abc"; + ASSERT_THAT(send(s_, str, sizeof(str), 0), + SyscallSucceedsWithValue(sizeof(str))); + + n = -1; + EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, sizeof(str)); + + EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds()); + + n = -1; + EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, sizeof(str)); +} + +TEST_P(UdpSocketTest, Fionread) { + // Bind s_ to loopback:TestPort. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Check that the bound socket with an empty buffer reports an empty first + // packet. + int n = -1; + EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, 0); + + // Send 3 packets from t_ to s_. + constexpr int psize = 100; + char buf[3 * psize]; + RandomizeBuffer(buf, sizeof(buf)); + + for (int i = 0; i < 3; ++i) { + ASSERT_THAT(sendto(t_, buf + i * psize, psize, 0, addr_[0], addrlen_), + SyscallSucceedsWithValue(psize)); + + // Check that regardless of how many packets are in the queue, the size + // reported is that of a single packet. + n = -1; + EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, psize); + } +} + +TEST_P(UdpSocketTest, FIONREADZeroLengthPacket) { + // Bind s_ to loopback:TestPort. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // Check that the bound socket with an empty buffer reports an empty first + // packet. + int n = -1; + EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, 0); + + // Send 3 packets from t_ to s_. + constexpr int psize = 100; + char buf[3 * psize]; + RandomizeBuffer(buf, sizeof(buf)); + + for (int i = 0; i < 3; ++i) { + ASSERT_THAT(sendto(t_, buf + i * psize, 0, 0, addr_[0], addrlen_), + SyscallSucceedsWithValue(0)); + + // Check that regardless of how many packets are in the queue, the size + // reported is that of a single packet. + n = -1; + EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, 0); + } +} + +TEST_P(UdpSocketTest, FIONREADZeroLengthWriteShutdown) { + int n = -1; + EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, 0); + + // Bind s_ to loopback:TestPort. + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + + // A UDP socket must be connected before it can be shutdown. + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + + n = -1; + EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, 0); + + const char str[] = "abc"; + ASSERT_THAT(send(s_, str, 0, 0), SyscallSucceedsWithValue(0)); + + n = -1; + EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, 0); + + EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds()); + + n = -1; + EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, 0); +} + +TEST_P(UdpSocketTest, SoTimestampOffByDefault) { + // TODO(gvisor.dev/issue/1202): SO_TIMESTAMP socket option not supported by + // hostinet. + SKIP_IF(IsRunningWithHostinet()); + + int v = -1; + socklen_t optlen = sizeof(v); + ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, &optlen), + SyscallSucceeds()); + ASSERT_EQ(v, kSockOptOff); + ASSERT_EQ(optlen, sizeof(v)); +} + +TEST_P(UdpSocketTest, SoTimestamp) { + // TODO(gvisor.dev/issue/1202): ioctl() and SO_TIMESTAMP socket option are not + // supported by hostinet. + SKIP_IF(IsRunningWithHostinet()); + + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds()); + + int v = 1; + ASSERT_THAT(setsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)), + SyscallSucceeds()); + + char buf[3]; + // Send zero length packet from t_ to s_. + ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0)); + + char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))]; + msghdr msg; + memset(&msg, 0, sizeof(msg)); + iovec iov; + memset(&iov, 0, sizeof(iov)); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsgbuf; + msg.msg_controllen = sizeof(cmsgbuf); + + ASSERT_THAT(RetryEINTR(recvmsg)(s_, &msg, 0), SyscallSucceedsWithValue(0)); + + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, nullptr); + ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET); + ASSERT_EQ(cmsg->cmsg_type, SO_TIMESTAMP); + ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(struct timeval))); + + struct timeval tv = {}; + memcpy(&tv, CMSG_DATA(cmsg), sizeof(struct timeval)); + + ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0); + + // There should be nothing to get via ioctl. + ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallFailsWithErrno(ENOENT)); +} + +TEST_P(UdpSocketTest, WriteShutdownNotConnected) { + EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN)); +} + +TEST_P(UdpSocketTest, TimestampIoctl) { + // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet. + SKIP_IF(IsRunningWithHostinet()); + + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds()); + + char buf[3]; + // Send packet from t_ to s_. + ASSERT_THAT(RetryEINTR(write)(t_, buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + // There should be no control messages. + char recv_buf[sizeof(buf)]; + ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf))); + + // A nonzero timeval should be available via ioctl. + struct timeval tv = {}; + ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallSucceeds()); + ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0); +} + +TEST_P(UdpSocketTest, TimestampIoctlNothingRead) { + // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet. + SKIP_IF(IsRunningWithHostinet()); + + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds()); + + struct timeval tv = {}; + ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallFailsWithErrno(ENOENT)); +} + +// Test that the timestamp accessed via SIOCGSTAMP is still accessible after +// SO_TIMESTAMP is enabled and used to retrieve a timestamp. +TEST_P(UdpSocketTest, TimestampIoctlPersistence) { + // TODO(gvisor.dev/issue/1202): ioctl() and SO_TIMESTAMP socket option are not + // supported by hostinet. + SKIP_IF(IsRunningWithHostinet()); + + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds()); + + char buf[3]; + // Send packet from t_ to s_. + ASSERT_THAT(RetryEINTR(write)(t_, buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0)); + + // There should be no control messages. + char recv_buf[sizeof(buf)]; + ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf))); + + // A nonzero timeval should be available via ioctl. + struct timeval tv = {}; + ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallSucceeds()); + ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0); + + // Enable SO_TIMESTAMP and send a message. + int v = 1; + EXPECT_THAT(setsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)), + SyscallSucceeds()); + ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0)); + + // There should be a message for SO_TIMESTAMP. + char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))]; + msghdr msg = {}; + iovec iov = {}; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsgbuf; + msg.msg_controllen = sizeof(cmsgbuf); + ASSERT_THAT(RetryEINTR(recvmsg)(s_, &msg, 0), SyscallSucceedsWithValue(0)); + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, nullptr); + + // The ioctl should return the exact same values as before. + struct timeval tv2 = {}; + ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv2), SyscallSucceeds()); + ASSERT_EQ(tv.tv_sec, tv2.tv_sec); + ASSERT_EQ(tv.tv_usec, tv2.tv_usec); +} + +// Test that a socket with IP_TOS or IPV6_TCLASS set will set the TOS byte on +// outgoing packets, and that a receiving socket with IP_RECVTOS or +// IPV6_RECVTCLASS will create the corresponding control message. +TEST_P(UdpSocketTest, SetAndReceiveTOS) { + // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack. + SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet()); + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds()); + + // Allow socket to receive control message. + int recv_level = SOL_IP; + int recv_type = IP_RECVTOS; + if (GetParam() != AddressFamily::kIpv4) { + recv_level = SOL_IPV6; + recv_type = IPV6_RECVTCLASS; + } + ASSERT_THAT( + setsockopt(s_, recv_level, recv_type, &kSockOptOn, sizeof(kSockOptOn)), + SyscallSucceeds()); + + // Set socket TOS. + int sent_level = recv_level; + int sent_type = IP_TOS; + if (sent_level == SOL_IPV6) { + sent_type = IPV6_TCLASS; + } + int sent_tos = IPTOS_LOWDELAY; // Choose some TOS value. + ASSERT_THAT( + setsockopt(t_, sent_level, sent_type, &sent_tos, sizeof(sent_tos)), + SyscallSucceeds()); + + // Prepare message to send. + constexpr size_t kDataLength = 1024; + struct msghdr sent_msg = {}; + struct iovec sent_iov = {}; + char sent_data[kDataLength]; + sent_iov.iov_base = &sent_data[0]; + sent_iov.iov_len = kDataLength; + sent_msg.msg_iov = &sent_iov; + sent_msg.msg_iovlen = 1; + + ASSERT_THAT(RetryEINTR(sendmsg)(t_, &sent_msg, 0), + SyscallSucceedsWithValue(kDataLength)); + + // Receive message. + struct msghdr received_msg = {}; + struct iovec received_iov = {}; + char received_data[kDataLength]; + received_iov.iov_base = &received_data[0]; + received_iov.iov_len = kDataLength; + received_msg.msg_iov = &received_iov; + received_msg.msg_iovlen = 1; + size_t cmsg_data_len = sizeof(int8_t); + if (sent_type == IPV6_TCLASS) { + cmsg_data_len = sizeof(int); + } + std::vector<char> received_cmsgbuf(CMSG_SPACE(cmsg_data_len)); + received_msg.msg_control = &received_cmsgbuf[0]; + received_msg.msg_controllen = received_cmsgbuf.size(); + ASSERT_THAT(RetryEINTR(recvmsg)(s_, &received_msg, 0), + SyscallSucceedsWithValue(kDataLength)); + + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg); + ASSERT_NE(cmsg, nullptr); + EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len)); + EXPECT_EQ(cmsg->cmsg_level, sent_level); + EXPECT_EQ(cmsg->cmsg_type, sent_type); + int8_t received_tos = 0; + memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos)); + EXPECT_EQ(received_tos, sent_tos); +} + +// Test that sendmsg with IP_TOS and IPV6_TCLASS control messages will set the +// TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or +// IPV6_RECVTCLASS will create the corresponding control message. +TEST_P(UdpSocketTest, SendAndReceiveTOS) { + // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack. + SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet()); + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds()); + + // Allow socket to receive control message. + int recv_level = SOL_IP; + int recv_type = IP_RECVTOS; + if (GetParam() != AddressFamily::kIpv4) { + recv_level = SOL_IPV6; + recv_type = IPV6_RECVTCLASS; + } + int recv_opt = kSockOptOn; + ASSERT_THAT( + setsockopt(s_, recv_level, recv_type, &recv_opt, sizeof(recv_opt)), + SyscallSucceeds()); + + // Prepare message to send. + constexpr size_t kDataLength = 1024; + int sent_level = recv_level; + int sent_type = IP_TOS; + int sent_tos = IPTOS_LOWDELAY; // Choose some TOS value. + + struct msghdr sent_msg = {}; + struct iovec sent_iov = {}; + char sent_data[kDataLength]; + sent_iov.iov_base = &sent_data[0]; + sent_iov.iov_len = kDataLength; + sent_msg.msg_iov = &sent_iov; + sent_msg.msg_iovlen = 1; + size_t cmsg_data_len = sizeof(int8_t); + if (sent_level == SOL_IPV6) { + sent_type = IPV6_TCLASS; + cmsg_data_len = sizeof(int); + } + std::vector<char> sent_cmsgbuf(CMSG_SPACE(cmsg_data_len)); + sent_msg.msg_control = &sent_cmsgbuf[0]; + sent_msg.msg_controllen = CMSG_LEN(cmsg_data_len); + + // Manually add control message. + struct cmsghdr* sent_cmsg = CMSG_FIRSTHDR(&sent_msg); + sent_cmsg->cmsg_len = CMSG_LEN(cmsg_data_len); + sent_cmsg->cmsg_level = sent_level; + sent_cmsg->cmsg_type = sent_type; + *(int8_t*)CMSG_DATA(sent_cmsg) = sent_tos; + + ASSERT_THAT(RetryEINTR(sendmsg)(t_, &sent_msg, 0), + SyscallSucceedsWithValue(kDataLength)); + + // Receive message. + struct msghdr received_msg = {}; + struct iovec received_iov = {}; + char received_data[kDataLength]; + received_iov.iov_base = &received_data[0]; + received_iov.iov_len = kDataLength; + received_msg.msg_iov = &received_iov; + received_msg.msg_iovlen = 1; + std::vector<char> received_cmsgbuf(CMSG_SPACE(cmsg_data_len)); + received_msg.msg_control = &received_cmsgbuf[0]; + received_msg.msg_controllen = CMSG_LEN(cmsg_data_len); + ASSERT_THAT(RetryEINTR(recvmsg)(s_, &received_msg, 0), + SyscallSucceedsWithValue(kDataLength)); + + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg); + ASSERT_NE(cmsg, nullptr); + EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len)); + EXPECT_EQ(cmsg->cmsg_level, sent_level); + EXPECT_EQ(cmsg->cmsg_type, sent_type); + int8_t received_tos = 0; + memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos)); + EXPECT_EQ(received_tos, sent_tos); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/udp_socket_test_cases.h b/test/syscalls/linux/udp_socket_test_cases.h new file mode 100644 index 000000000..2fd79d99e --- /dev/null +++ b/test/syscalls/linux/udp_socket_test_cases.h @@ -0,0 +1,74 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef THIRD_PARTY_GOLANG_GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_H_ +#define THIRD_PARTY_GOLANG_GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_H_ + +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" + +namespace gvisor { +namespace testing { + +// The initial port to be be used on gvisor. +constexpr int TestPort = 40000; + +// Fixture for tests parameterized by the address family to use (AF_INET and +// AF_INET6) when creating sockets. +class UdpSocketTest + : public ::testing::TestWithParam<gvisor::testing::AddressFamily> { + protected: + // Creates two sockets that will be used by test cases. + void SetUp() override; + + // Closes the sockets created by SetUp(). + void TearDown() override { + EXPECT_THAT(close(s_), SyscallSucceeds()); + EXPECT_THAT(close(t_), SyscallSucceeds()); + + for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) { + ASSERT_NO_ERRNO(FreeAvailablePort(ports_[i])); + } + } + + // First UDP socket. + int s_; + + // Second UDP socket. + int t_; + + // The length of the socket address. + socklen_t addrlen_; + + // Initialized address pointing to loopback and port TestPort+i. + struct sockaddr* addr_[3]; + + // Initialize "any" address. + struct sockaddr* anyaddr_; + + // Used ports. + int ports_[3]; + + private: + // Storage for the loopback addresses. + struct sockaddr_storage addr_storage_[3]; + + // Storage for the "any" address. + struct sockaddr_storage anyaddr_storage_; +}; + +} // namespace testing +} // namespace gvisor + +#endif // THIRD_PARTY_GOLANG_GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_H_ diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc new file mode 100644 index 000000000..75740238c --- /dev/null +++ b/test/syscalls/linux/xattr.cc @@ -0,0 +1,488 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sys/types.h> +#include <sys/xattr.h> +#include <unistd.h> + +#include <string> +#include <vector> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/file_base.h" +#include "test/util/capability_util.h" +#include "test/util/posix_error.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +class XattrTest : public FileTest {}; + +TEST_F(XattrTest, XattrNullName) { + const char* path = test_file_name_.c_str(); + + EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0), + SyscallFailsWithErrno(EFAULT)); + EXPECT_THAT(getxattr(path, nullptr, nullptr, 0), + SyscallFailsWithErrno(EFAULT)); +} + +TEST_F(XattrTest, XattrEmptyName) { + const char* path = test_file_name_.c_str(); + + EXPECT_THAT(setxattr(path, "", nullptr, 0, /*flags=*/0), + SyscallFailsWithErrno(ERANGE)); + EXPECT_THAT(getxattr(path, "", nullptr, 0), SyscallFailsWithErrno(ERANGE)); +} + +TEST_F(XattrTest, XattrLargeName) { + const char* path = test_file_name_.c_str(); + std::string name = "user."; + name += std::string(XATTR_NAME_MAX - name.length(), 'a'); + + // TODO(b/127675828): Support setxattr and getxattr. + if (!IsRunningOnGvisor()) { + EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0), + SyscallSucceeds()); + EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0), + SyscallSucceedsWithValue(0)); + } + + name += "a"; + EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0), + SyscallFailsWithErrno(ERANGE)); + EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0), + SyscallFailsWithErrno(ERANGE)); +} + +TEST_F(XattrTest, XattrInvalidPrefix) { + const char* path = test_file_name_.c_str(); + std::string name(XATTR_NAME_MAX, 'a'); + EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0), + SyscallFailsWithErrno(EOPNOTSUPP)); + EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0), + SyscallFailsWithErrno(EOPNOTSUPP)); +} + +TEST_F(XattrTest, XattrReadOnly) { + // Drop capabilities that allow us to override file and directory permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); + + const char* path = test_file_name_.c_str(); + char name[] = "user.abc"; + char val = 'a'; + size_t size = sizeof(val); + + // TODO(b/127675828): Support setxattr and getxattr. + if (!IsRunningOnGvisor()) { + EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), + SyscallSucceeds()); + } + + ASSERT_NO_ERRNO(testing::Chmod(test_file_name_, S_IRUSR)); + + EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), + SyscallFailsWithErrno(EACCES)); + + // TODO(b/127675828): Support setxattr and getxattr. + if (!IsRunningOnGvisor()) { + char buf = '-'; + EXPECT_THAT(getxattr(path, name, &buf, size), + SyscallSucceedsWithValue(size)); + EXPECT_EQ(buf, val); + } +} + +TEST_F(XattrTest, XattrWriteOnly) { + // Drop capabilities that allow us to override file and directory permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); + ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); + + ASSERT_NO_ERRNO(testing::Chmod(test_file_name_, S_IWUSR)); + + const char* path = test_file_name_.c_str(); + char name[] = "user.abc"; + char val = 'a'; + size_t size = sizeof(val); + + // TODO(b/127675828): Support setxattr and getxattr. + if (!IsRunningOnGvisor()) { + EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), + SyscallSucceeds()); + } + + EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(EACCES)); +} + +TEST_F(XattrTest, XattrTrustedWithNonadmin) { + // TODO(b/127675828): Support setxattr and getxattr. + SKIP_IF(IsRunningOnGvisor()); + SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + const char* path = test_file_name_.c_str(); + const char name[] = "trusted.abc"; + EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), + SyscallFailsWithErrno(EPERM)); + EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA)); +} + +TEST_F(XattrTest, XattrOnDirectory) { + // TODO(b/127675828): Support setxattr and getxattr. + SKIP_IF(IsRunningOnGvisor()); + + TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + char name[] = "user.abc"; + EXPECT_THAT(setxattr(dir.path().c_str(), name, NULL, 0, /*flags=*/0), + SyscallSucceeds()); + EXPECT_THAT(getxattr(dir.path().c_str(), name, NULL, 0), + SyscallSucceedsWithValue(0)); +} + +TEST_F(XattrTest, XattrOnSymlink) { + // TODO(b/127675828): Support setxattr and getxattr. + SKIP_IF(IsRunningOnGvisor()); + + TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + TempPath link = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo(dir.path(), test_file_name_)); + char name[] = "user.abc"; + EXPECT_THAT(setxattr(link.path().c_str(), name, NULL, 0, /*flags=*/0), + SyscallSucceeds()); + EXPECT_THAT(getxattr(link.path().c_str(), name, NULL, 0), + SyscallSucceedsWithValue(0)); +} + +TEST_F(XattrTest, XattrOnInvalidFileTypes) { + char name[] = "user.abc"; + + char char_device[] = "/dev/zero"; + EXPECT_THAT(setxattr(char_device, name, NULL, 0, /*flags=*/0), + SyscallFailsWithErrno(EPERM)); + EXPECT_THAT(getxattr(char_device, name, NULL, 0), + SyscallFailsWithErrno(ENODATA)); + + // Use tmpfs, where creation of named pipes is supported. + const std::string fifo = NewTempAbsPathInDir("/dev/shm"); + const char* path = fifo.c_str(); + EXPECT_THAT(mknod(path, S_IFIFO | S_IRUSR | S_IWUSR, 0), SyscallSucceeds()); + EXPECT_THAT(setxattr(path, name, NULL, 0, /*flags=*/0), + SyscallFailsWithErrno(EPERM)); + EXPECT_THAT(getxattr(path, name, NULL, 0), SyscallFailsWithErrno(ENODATA)); +} + +TEST_F(XattrTest, SetxattrSizeSmallerThanValue) { + // TODO(b/127675828): Support setxattr and getxattr. + SKIP_IF(IsRunningOnGvisor()); + + const char* path = test_file_name_.c_str(); + char name[] = "user.abc"; + std::vector<char> val = {'a', 'a'}; + size_t size = 1; + EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0), + SyscallSucceeds()); + + std::vector<char> buf = {'-', '-'}; + std::vector<char> expected_buf = {'a', '-'}; + EXPECT_THAT(getxattr(path, name, buf.data(), buf.size()), + SyscallSucceedsWithValue(size)); + EXPECT_EQ(buf, expected_buf); +} + +TEST_F(XattrTest, SetxattrZeroSize) { + // TODO(b/127675828): Support setxattr and getxattr. + SKIP_IF(IsRunningOnGvisor()); + + const char* path = test_file_name_.c_str(); + char name[] = "user.abc"; + char val = 'a'; + EXPECT_THAT(setxattr(path, name, &val, 0, /*flags=*/0), SyscallSucceeds()); + + char buf = '-'; + EXPECT_THAT(getxattr(path, name, &buf, XATTR_SIZE_MAX), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(buf, '-'); +} + +TEST_F(XattrTest, SetxattrSizeTooLarge) { + const char* path = test_file_name_.c_str(); + char name[] = "user.abc"; + + // Note that each particular fs implementation may stipulate a lower size + // limit, in which case we actually may fail (e.g. error with ENOSPC) for + // some sizes under XATTR_SIZE_MAX. + size_t size = XATTR_SIZE_MAX + 1; + std::vector<char> val(size); + EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0), + SyscallFailsWithErrno(E2BIG)); + + // TODO(b/127675828): Support setxattr and getxattr. + if (!IsRunningOnGvisor()) { + EXPECT_THAT(getxattr(path, name, nullptr, 0), + SyscallFailsWithErrno(ENODATA)); + } +} + +TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) { + const char* path = test_file_name_.c_str(); + char name[] = "user.abc"; + EXPECT_THAT(setxattr(path, name, nullptr, 1, /*flags=*/0), + SyscallFailsWithErrno(EFAULT)); + + // TODO(b/127675828): Support setxattr and getxattr. + if (!IsRunningOnGvisor()) { + EXPECT_THAT(getxattr(path, name, nullptr, 0), + SyscallFailsWithErrno(ENODATA)); + } +} + +TEST_F(XattrTest, SetxattrNullValueAndZeroSize) { + // TODO(b/127675828): Support setxattr and getxattr. + SKIP_IF(IsRunningOnGvisor()); + + const char* path = test_file_name_.c_str(); + char name[] = "user.abc"; + EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds()); + + EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0)); +} + +TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) { + // TODO(b/127675828): Support setxattr and getxattr. + SKIP_IF(IsRunningOnGvisor()); + + const char* path = test_file_name_.c_str(); + char name[] = "user.abc"; + std::vector<char> val(XATTR_SIZE_MAX + 1); + std::fill(val.begin(), val.end(), 'a'); + size_t size = 1; + EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0), + SyscallSucceeds()); + + std::vector<char> buf = {'-', '-'}; + std::vector<char> expected_buf = {'a', '-'}; + EXPECT_THAT(getxattr(path, name, buf.data(), size), + SyscallSucceedsWithValue(size)); + EXPECT_EQ(buf, expected_buf); +} + +TEST_F(XattrTest, SetxattrReplaceWithSmaller) { + // TODO(b/127675828): Support setxattr and getxattr. + SKIP_IF(IsRunningOnGvisor()); + + const char* path = test_file_name_.c_str(); + char name[] = "user.abc"; + std::vector<char> val = {'a', 'a'}; + EXPECT_THAT(setxattr(path, name, val.data(), 2, /*flags=*/0), + SyscallSucceeds()); + EXPECT_THAT(setxattr(path, name, val.data(), 1, /*flags=*/0), + SyscallSucceeds()); + + std::vector<char> buf = {'-', '-'}; + std::vector<char> expected_buf = {'a', '-'}; + EXPECT_THAT(getxattr(path, name, buf.data(), 2), SyscallSucceedsWithValue(1)); + EXPECT_EQ(buf, expected_buf); +} + +TEST_F(XattrTest, SetxattrReplaceWithLarger) { + // TODO(b/127675828): Support setxattr and getxattr. + SKIP_IF(IsRunningOnGvisor()); + + const char* path = test_file_name_.c_str(); + char name[] = "user.abc"; + std::vector<char> val = {'a', 'a'}; + EXPECT_THAT(setxattr(path, name, val.data(), 1, /*flags=*/0), + SyscallSucceeds()); + EXPECT_THAT(setxattr(path, name, val.data(), 2, /*flags=*/0), + SyscallSucceeds()); + + std::vector<char> buf = {'-', '-'}; + EXPECT_THAT(getxattr(path, name, buf.data(), 2), SyscallSucceedsWithValue(2)); + EXPECT_EQ(buf, val); +} + +TEST_F(XattrTest, SetxattrCreateFlag) { + // TODO(b/127675828): Support setxattr and getxattr. + SKIP_IF(IsRunningOnGvisor()); + + const char* path = test_file_name_.c_str(); + char name[] = "user.abc"; + EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE), + SyscallSucceeds()); + EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE), + SyscallFailsWithErrno(EEXIST)); + + EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0)); +} + +TEST_F(XattrTest, SetxattrReplaceFlag) { + // TODO(b/127675828): Support setxattr and getxattr. + SKIP_IF(IsRunningOnGvisor()); + + const char* path = test_file_name_.c_str(); + char name[] = "user.abc"; + EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE), + SyscallFailsWithErrno(ENODATA)); + EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds()); + EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE), + SyscallSucceeds()); + + EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0)); +} + +TEST_F(XattrTest, SetxattrInvalidFlags) { + const char* path = test_file_name_.c_str(); + int invalid_flags = 0xff; + EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, invalid_flags), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_F(XattrTest, Getxattr) { + // TODO(b/127675828): Support setxattr and getxattr. + SKIP_IF(IsRunningOnGvisor()); + + const char* path = test_file_name_.c_str(); + char name[] = "user.abc"; + int val = 1234; + size_t size = sizeof(val); + EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds()); + + int buf = 0; + EXPECT_THAT(getxattr(path, name, &buf, size), SyscallSucceedsWithValue(size)); + EXPECT_EQ(buf, val); +} + +TEST_F(XattrTest, GetxattrSizeSmallerThanValue) { + // TODO(b/127675828): Support setxattr and getxattr. + SKIP_IF(IsRunningOnGvisor()); + + const char* path = test_file_name_.c_str(); + char name[] = "user.abc"; + std::vector<char> val = {'a', 'a'}; + size_t size = val.size(); + EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds()); + + char buf = '-'; + EXPECT_THAT(getxattr(path, name, &buf, 1), SyscallFailsWithErrno(ERANGE)); + EXPECT_EQ(buf, '-'); +} + +TEST_F(XattrTest, GetxattrSizeLargerThanValue) { + // TODO(b/127675828): Support setxattr and getxattr. + SKIP_IF(IsRunningOnGvisor()); + + const char* path = test_file_name_.c_str(); + char name[] = "user.abc"; + char val = 'a'; + EXPECT_THAT(setxattr(path, name, &val, 1, /*flags=*/0), SyscallSucceeds()); + + std::vector<char> buf(XATTR_SIZE_MAX); + std::fill(buf.begin(), buf.end(), '-'); + std::vector<char> expected_buf = buf; + expected_buf[0] = 'a'; + EXPECT_THAT(getxattr(path, name, buf.data(), buf.size()), + SyscallSucceedsWithValue(1)); + EXPECT_EQ(buf, expected_buf); +} + +TEST_F(XattrTest, GetxattrZeroSize) { + // TODO(b/127675828): Support setxattr and getxattr. + SKIP_IF(IsRunningOnGvisor()); + + const char* path = test_file_name_.c_str(); + char name[] = "user.abc"; + char val = 'a'; + EXPECT_THAT(setxattr(path, name, &val, sizeof(val), /*flags=*/0), + SyscallSucceeds()); + + char buf = '-'; + EXPECT_THAT(getxattr(path, name, &buf, 0), + SyscallSucceedsWithValue(sizeof(val))); + EXPECT_EQ(buf, '-'); +} + +TEST_F(XattrTest, GetxattrSizeTooLarge) { + // TODO(b/127675828): Support setxattr and getxattr. + SKIP_IF(IsRunningOnGvisor()); + + const char* path = test_file_name_.c_str(); + char name[] = "user.abc"; + char val = 'a'; + EXPECT_THAT(setxattr(path, name, &val, sizeof(val), /*flags=*/0), + SyscallSucceeds()); + + std::vector<char> buf(XATTR_SIZE_MAX + 1); + std::fill(buf.begin(), buf.end(), '-'); + std::vector<char> expected_buf = buf; + expected_buf[0] = 'a'; + EXPECT_THAT(getxattr(path, name, buf.data(), buf.size()), + SyscallSucceedsWithValue(sizeof(val))); + EXPECT_EQ(buf, expected_buf); +} + +TEST_F(XattrTest, GetxattrNullValue) { + // TODO(b/127675828): Support setxattr and getxattr. + SKIP_IF(IsRunningOnGvisor()); + + const char* path = test_file_name_.c_str(); + char name[] = "user.abc"; + char val = 'a'; + size_t size = sizeof(val); + EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds()); + + EXPECT_THAT(getxattr(path, name, nullptr, size), + SyscallFailsWithErrno(EFAULT)); +} + +TEST_F(XattrTest, GetxattrNullValueAndZeroSize) { + // TODO(b/127675828): Support setxattr and getxattr. + SKIP_IF(IsRunningOnGvisor()); + + const char* path = test_file_name_.c_str(); + char name[] = "user.abc"; + char val = 'a'; + size_t size = sizeof(val); + // Set value with zero size. + EXPECT_THAT(setxattr(path, name, &val, 0, /*flags=*/0), SyscallSucceeds()); + // Get value with nonzero size. + EXPECT_THAT(getxattr(path, name, nullptr, size), SyscallSucceedsWithValue(0)); + + // Set value with nonzero size. + EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds()); + // Get value with zero size. + EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(size)); +} + +TEST_F(XattrTest, GetxattrNonexistentName) { + // TODO(b/127675828): Support getxattr. + SKIP_IF(IsRunningOnGvisor()); + + const char* path = test_file_name_.c_str(); + std::string name = "user.nonexistent"; + EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0), + SyscallFailsWithErrno(ENODATA)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/syscall_test_runner.go b/test/syscalls/syscall_test_runner.go index accf46347..b9fd885ff 100644 --- a/test/syscalls/syscall_test_runner.go +++ b/test/syscalls/syscall_test_runner.go @@ -46,6 +46,7 @@ var ( debug = flag.Bool("debug", false, "enable debug logs") strace = flag.Bool("strace", false, "enable strace logs") platform = flag.String("platform", "ptrace", "platform to run on") + network = flag.String("network", "none", "network stack to run on (sandbox, host, none)") useTmpfs = flag.Bool("use-tmpfs", false, "mounts tmpfs for /tmp") fileAccess = flag.String("file-access", "exclusive", "mounts root in exclusive or shared mode") overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable tmpfs overlay") @@ -137,7 +138,7 @@ func runRunsc(tc gtest.TestCase, spec *specs.Spec) error { args := []string{ "-root", rootDir, - "-network=none", + "-network", *network, "-log-format=text", "-TESTONLY-unsafe-nonroot=true", "-net-raw=true", @@ -335,10 +336,11 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) { }) } - // Set environment variable that indicates we are - // running in gVisor and with the given platform. + // Set environment variables that indicate we are + // running in gVisor with the given platform and network. platformVar := "TEST_ON_GVISOR" - env := append(os.Environ(), platformVar+"="+*platform) + networkVar := "GVISOR_NETWORK" + env := append(os.Environ(), platformVar+"="+*platform, networkVar+"="+*network) // Remove env variables that cause the gunit binary to write output // files, since they will stomp on eachother, and on the output files diff --git a/test/util/BUILD b/test/util/BUILD index 4526bb3f1..cbc728159 100644 --- a/test/util/BUILD +++ b/test/util/BUILD @@ -237,6 +237,7 @@ cc_library( ] + select_for_linux( [ "test_util_impl.cc", + "test_util_runfiles.cc", ], ), hdrs = ["test_util.h"], @@ -245,6 +246,7 @@ cc_library( ":logging", ":posix_error", ":save_util", + "@bazel_tools//tools/cpp/runfiles", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/flags:flag", "@com_google_absl//absl/flags:parse", diff --git a/test/util/fs_util.cc b/test/util/fs_util.cc index 88b1e7911..042cec94a 100644 --- a/test/util/fs_util.cc +++ b/test/util/fs_util.cc @@ -105,6 +105,15 @@ PosixErrorOr<struct stat> Stat(absl::string_view path) { return stat_buf; } +PosixErrorOr<struct stat> Lstat(absl::string_view path) { + struct stat stat_buf; + int res = lstat(std::string(path).c_str(), &stat_buf); + if (res < 0) { + return PosixError(errno, absl::StrCat("lstat ", path)); + } + return stat_buf; +} + PosixErrorOr<struct stat> Fstat(int fd) { struct stat stat_buf; int res = fstat(fd, &stat_buf); @@ -127,7 +136,7 @@ PosixErrorOr<bool> Exists(absl::string_view path) { } PosixErrorOr<bool> IsDirectory(absl::string_view path) { - ASSIGN_OR_RETURN_ERRNO(struct stat stat_buf, Stat(path)); + ASSIGN_OR_RETURN_ERRNO(struct stat stat_buf, Lstat(path)); if (S_ISDIR(stat_buf.st_mode)) { return true; } diff --git a/test/util/test_util.cc b/test/util/test_util.cc index 9cb050735..848504c88 100644 --- a/test/util/test_util.cc +++ b/test/util/test_util.cc @@ -41,6 +41,7 @@ namespace gvisor { namespace testing { #define TEST_ON_GVISOR "TEST_ON_GVISOR" +#define GVISOR_NETWORK "GVISOR_NETWORK" bool IsRunningOnGvisor() { return GvisorPlatform() != Platform::kNative; } @@ -60,6 +61,11 @@ Platform GvisorPlatform() { abort(); } +bool IsRunningWithHostinet() { + char* env = getenv(GVISOR_NETWORK); + return env && strcmp(env, "host") == 0; +} + // Inline cpuid instruction. Preserve %ebx/%rbx register. In PIC compilations // %ebx contains the address of the global offset table. %rbx is occasionally // used to address stack variables in presence of dynamic allocas. diff --git a/test/util/test_util.h b/test/util/test_util.h index dc30575b8..b3235c7e3 100644 --- a/test/util/test_util.h +++ b/test/util/test_util.h @@ -220,6 +220,7 @@ enum class Platform { }; bool IsRunningOnGvisor(); Platform GvisorPlatform(); +bool IsRunningWithHostinet(); #ifdef __linux__ void SetupGvisorDeathTest(); @@ -764,6 +765,12 @@ MATCHER_P2(EquivalentWithin, target, tolerance, return Equivalent(arg, target, tolerance); } +// Returns the absolute path to the a data dependency. 'path' is the runfile +// location relative to workspace root. +#ifdef __linux__ +std::string RunfilePath(std::string path); +#endif + void TestInit(int* argc, char*** argv); } // namespace testing diff --git a/test/util/test_util_runfiles.cc b/test/util/test_util_runfiles.cc new file mode 100644 index 000000000..7210094eb --- /dev/null +++ b/test/util/test_util_runfiles.cc @@ -0,0 +1,46 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <iostream> +#include <string> + +#include "test/util/fs_util.h" +#include "test/util/test_util.h" +#include "tools/cpp/runfiles/runfiles.h" + +namespace gvisor { +namespace testing { + +std::string RunfilePath(std::string path) { + static const bazel::tools::cpp::runfiles::Runfiles* const runfiles = [] { + std::string error; + auto* runfiles = + bazel::tools::cpp::runfiles::Runfiles::CreateForTest(&error); + if (runfiles == nullptr) { + std::cerr << "Unable to find runfiles: " << error << std::endl; + } + return runfiles; + }(); + + if (!runfiles) { + // Can't find runfiles? This probably won't work, but __main__/path is our + // best guess. + return JoinPath("__main__", path); + } + + return runfiles->Rlocation(JoinPath("__main__", path)); +} + +} // namespace testing +} // namespace gvisor diff --git a/tools/go_stateify/defs.bzl b/tools/go_stateify/defs.bzl index 3ce36c1c8..33267c074 100644 --- a/tools/go_stateify/defs.bzl +++ b/tools/go_stateify/defs.bzl @@ -43,12 +43,13 @@ def _go_stateify_impl(ctx): # Run the stateify command. args = ["-output=%s" % output.path] - args += ["-pkg=%s" % ctx.attr.package] + args.append("-pkg=%s" % ctx.attr.package) + args.append("-arch=%s" % ctx.attr.arch) if ctx.attr._statepkg: - args += ["-statepkg=%s" % ctx.attr._statepkg] + args.append("-statepkg=%s" % ctx.attr._statepkg) if ctx.attr.imports: - args += ["-imports=%s" % ",".join(ctx.attr.imports)] - args += ["--"] + args.append("-imports=%s" % ",".join(ctx.attr.imports)) + args.append("--") for src in ctx.attr.srcs: args += [f.path for f in src.files.to_list()] ctx.actions.run( @@ -83,6 +84,10 @@ for statified types. doc = "The package name for the input sources.", mandatory = True, ), + "arch": attr.string( + doc = "Target platform.", + mandatory = True, + ), "out": attr.output( doc = """ The name of the generated file output. This must not conflict with any other @@ -118,6 +123,10 @@ def go_library(name, srcs, deps = [], imports = [], **kwargs): srcs = [src for src in srcs if src.endswith(".go")], imports = imports, package = name, + arch = select({ + "@bazel_tools//src/conditions:linux_aarch64": "arm64", + "//conditions:default": "amd64", + }), out = name + "_state_autogen.go", ) all_srcs = srcs + [name + "_state_autogen.go"] diff --git a/tools/go_stateify/main.go b/tools/go_stateify/main.go index db7a7107b..7d5d291e6 100644 --- a/tools/go_stateify/main.go +++ b/tools/go_stateify/main.go @@ -22,7 +22,9 @@ import ( "go/ast" "go/parser" "go/token" + "io/ioutil" "os" + "path/filepath" "reflect" "strings" "sync" @@ -33,8 +35,113 @@ var ( imports = flag.String("imports", "", "extra imports for the output file") output = flag.String("output", "", "output file") statePkg = flag.String("statepkg", "", "state import package; defaults to empty") + arch = flag.String("arch", "", "specify the target platform") ) +// The known architectures. +var okgoarch = []string{ + "386", + "amd64", + "arm", + "arm64", + "mips", + "mipsle", + "mips64", + "mips64le", + "ppc64", + "ppc64le", + "riscv64", + "s390x", + "sparc64", + "wasm", +} + +// readfile returns the content of the named file. +func readfile(file string) string { + data, err := ioutil.ReadFile(file) + if err != nil { + panic(fmt.Sprintf("readfile err: %v", err)) + } + return string(data) +} + +// matchfield reports whether the field (x,y,z) matches this build. +// all the elements in the field must be satisfied. +func matchfield(f string, goarch string) bool { + for _, tag := range strings.Split(f, ",") { + if !matchtag(tag, goarch) { + return false + } + } + return true +} + +// matchtag reports whether the tag (x or !x) matches this build. +func matchtag(tag string, goarch string) bool { + if tag == "" { + return false + } + if tag[0] == '!' { + if len(tag) == 1 || tag[1] == '!' { + return false + } + return !matchtag(tag[1:], goarch) + } + return tag == goarch +} + +// canBuild reports whether we can build this file for target platform by +// checking file name and build tags. The code is derived from the Go source +// cmd.dist.build.shouldbuild. +func canBuild(file, goTargetArch string) bool { + name := filepath.Base(file) + excluded := func(list []string, ok string) bool { + for _, x := range list { + if x == ok || (ok == "android" && x == "linux") || (ok == "illumos" && x == "solaris") { + continue + } + i := strings.Index(name, x) + if i <= 0 || name[i-1] != '_' { + continue + } + i += len(x) + if i == len(name) || name[i] == '.' || name[i] == '_' { + return true + } + } + return false + } + if excluded(okgoarch, goTargetArch) { + return false + } + + // Check file contents for // +build lines. + for _, p := range strings.Split(readfile(file), "\n") { + p = strings.TrimSpace(p) + if p == "" { + continue + } + if !strings.HasPrefix(p, "//") { + break + } + if !strings.Contains(p, "+build") { + continue + } + fields := strings.Fields(p[2:]) + if len(fields) < 1 || fields[0] != "+build" { + continue + } + for _, p := range fields[1:] { + if matchfield(p, goTargetArch) { + goto fieldmatch + } + } + return false + fieldmatch: + } + return true +} + // resolveTypeName returns a qualified type name. func resolveTypeName(name string, typ ast.Expr) (field string, qualified string) { for done := false; !done; { @@ -256,6 +363,11 @@ func main() { fmt.Fprintf(os.Stderr, "Input %q can't be parsed: %v\n", filename, err) os.Exit(1) } + + if !canBuild(filename, *arch) { + continue + } + files = append(files, f) } diff --git a/vdso/BUILD b/vdso/BUILD index 7ceed349e..2b6744c26 100644 --- a/vdso/BUILD +++ b/vdso/BUILD @@ -68,14 +68,14 @@ genrule( "&& $(location :check_vdso) " + "--check-data " + "--vdso $(location vdso.so) ", + exec_tools = [ + ":check_vdso", + ], features = ["-pie"], toolchains = [ "@bazel_tools//tools/cpp:current_cc_toolchain", ":no_pie_cc_flags", ], - tools = [ - ":check_vdso", - ], visibility = ["//:sandbox"], ) @@ -87,6 +87,6 @@ cc_flags_supplier( py_binary( name = "check_vdso", srcs = ["check_vdso.py"], - python_version = "PY2", + python_version = "PY3", visibility = ["//:sandbox"], ) |