diff options
151 files changed, 4989 insertions, 1531 deletions
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 000000000..cf782a580 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,21 @@ +name: "Build" +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + default: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/cache@v1 + with: + path: ~/.cache/bazel + key: ${{ runner.os }}-bazel-${{ hashFiles('WORKSPACE') }} + restore-keys: | + ${{ runner.os }}-bazel- + - run: make diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml new file mode 100644 index 000000000..60704f144 --- /dev/null +++ b/.github/workflows/go.yml @@ -0,0 +1,63 @@ +name: "Go" +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + generate: + runs-on: ubuntu-latest + steps: + - run: | + jq -nc '{"state": "pending", "context": "go tests"}' | \ + curl -sL -X POST -d @- \ + -H "Content-Type: application/json" \ + -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ + "${{ github.event.pull_request.statuses_url }}" + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + - uses: actions/setup-go@v2 + with: + go-version: 1.14 + - uses: actions/cache@v1 + with: + path: ~/go/pkg/mod + key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }} + restore-keys: | + ${{ runner.os }}-go- + - uses: actions/cache@v1 + with: + path: ~/.cache/bazel + key: ${{ runner.os }}-bazel-${{ hashFiles('WORKSPACE') }} + restore-keys: | + ${{ runner.os }}-bazel- + - run: make build TARGETS="//:gopath" + - run: tools/go_branch.sh + - run: git checkout go && git clean -f + - run: go build ./... + - if: github.event_name == 'push' + run: | + # Required dedicated credentials for the Go branch, due to the way + # branch protection rules are configured. + git config --global credential.helper cache + echo -e "protocol=https\nhost=github.com\nusername=${{ secrets.GO_TOKEN }}\npassword=x-oauth-basic" | git credential approve + git remote add upstream "https://github.com/${{ github.repository }}" + git push upstream go:go + - if: ${{ success() }} + run: | + jq -nc '{"state": "success", "context": "go tests"}' | \ + curl -sL -X POST -d @- \ + -H "Content-Type: application/json" \ + -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ + "${{ github.event.pull_request.statuses_url }}" + - if: ${{ failure() }} + run: | + jq -nc '{"state": "failure", "context": "go tests"}' | \ + curl -sL -X POST -d @- \ + -H "Content-Type: application/json" \ + -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ + "${{ github.event.pull_request.statuses_url }}" diff --git a/.github/workflows/issue_reviver.yml b/.github/workflows/issue_reviver.yml new file mode 100644 index 000000000..5e0254111 --- /dev/null +++ b/.github/workflows/issue_reviver.yml @@ -0,0 +1,14 @@ +name: "Issue reviver" +on: + schedule: + - cron: '0 0 * * *' + +jobs: + label: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - run: make run TARGETS="//tools/issue_reviver" + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_REPOSITORY: ${{ github.repository }} @@ -116,7 +116,7 @@ unit-tests: ## Runs all unit tests in pkg runsc and tools. .PHONY: unit-tests tests: ## Runs all local ptrace system call tests. - @$(MAKE) test OPTIONS="--test_tag_filter runsc_ptrace test/syscalls/..." + @$(MAKE) test OPTIONS="--test_tag_filters runsc_ptrace test/syscalls/..." .PHONY: tests ## @@ -1,12 +1,11 @@ ![gVisor](g3doc/logo.png) -[![Status](https://storage.googleapis.com/gvisor-build-badges/build.svg)](https://storage.googleapis.com/gvisor-build-badges/build.html) -[![gVisor chat](https://badges.gitter.im/gvisor/community.png)](https://gitter.im/gvisor/community) +![](https://github.com/google/gvisor/workflows/Build/badge.svg) ## What is gVisor? -**gVisor** is a user-space kernel, written in Go, that implements a substantial -portion of the Linux system surface. It includes an +**gVisor** is an application kernel, written in Go, that implements a +substantial portion of the Linux system surface. It includes an [Open Container Initiative (OCI)][oci] runtime called `runsc` that provides an isolation boundary between the application and the host kernel. The `runsc` runtime integrates with Docker and Kubernetes, making it simple to run sandboxed @@ -15,16 +14,17 @@ containers. ## Why does gVisor exist? Containers are not a [**sandbox**][sandbox]. While containers have -revolutionized how we develop, package, and deploy applications, running -untrusted or potentially malicious code without additional isolation is not a -good idea. The efficiency and performance gains from using a single, shared -kernel also mean that container escape is possible with a single vulnerability. - -gVisor is a user-space kernel for containers. It limits the host kernel surface -accessible to the application while still giving the application access to all -the features it expects. Unlike most kernels, gVisor does not assume or require -a fixed set of physical resources; instead, it leverages existing host kernel -functionality and runs as a normal user-space process. In other words, gVisor +revolutionized how we develop, package, and deploy applications, using them to +run untrusted or potentially malicious code without additional isolation is not +a good idea. While using a single, shared kernel allows for efficiency and +performance gains, it also means that container escape is possible with a single +vulnerability. + +gVisor is an application kernel for containers. It limits the host kernel +surface accessible to the application while still giving the application access +to all the features it expects. Unlike most kernels, gVisor does not assume or +require a fixed set of physical resources; instead, it leverages existing host +kernel functionality and runs as a normal process. In other words, gVisor implements Linux by way of Linux. gVisor should not be confused with technologies and tools to harden containers @@ -39,75 +39,44 @@ be found at [gvisor.dev][gvisor-dev]. ## Installing from source -gVisor currently requires x86\_64 Linux to build, though support for other -architectures may become available in the future. +gVisor builds on x86_64 and ARM64. Other architectures may become available in +the future. + +For the purposes of these instructions, [bazel][bazel] and other build +dependencies are wrapped in a build container. It is possible to use +[bazel][bazel] directly, or type `make help` for standard targets. ### Requirements Make sure the following dependencies are installed: * Linux 4.14.77+ ([older linux][old-linux]) -* [git][git] -* [Bazel][bazel] 1.2+ -* [Python][python] * [Docker version 17.09.0 or greater][docker] -* C++ toolchain supporting C++17 (GCC 7+, Clang 5+) -* Gold linker (e.g. `binutils-gold` package on Ubuntu) ### Building Build and install the `runsc` binary: ``` -bazel build runsc -sudo cp ./bazel-bin/runsc/linux_amd64_pure_stripped/runsc /usr/local/bin -``` - -If you don't want to install bazel on your system, you can build runsc in a -Docker container: - -``` make runsc sudo cp ./bazel-bin/runsc/linux_amd64_pure_stripped/runsc /usr/local/bin ``` ### Testing -The test suite can be run with Bazel: - -``` -bazel test //... -``` - -or in a Docker container: +To run standard test suites, you can use: ``` make unit-tests make tests ``` -### Using remote execution - -If you have a [Remote Build Execution][rbe] environment, you can use it to speed -up build and test cycles. - -You must authenticate with the project first: +To run specific tests, you can specify the target: ``` -gcloud auth application-default login --no-launch-browser +make test TARGETS="//runsc:version_test" ``` -Then invoke bazel with the following flags: - -``` ---config=remote ---project_id=$PROJECT ---remote_instance_name=projects/$PROJECT/instances/default_instance -``` - -You can also add those flags to your local ~/.bazelrc to avoid needing to -specify them each time on the command line. - ### Using `go get` This project uses [bazel][bazel] to build and manage dependencies. A synthetic @@ -128,7 +97,7 @@ development on this branch is not supported. Development should occur on the ## Community & Governance -The governance model is documented in our [community][community] repository. +See [GOVERNANCE.md](GOVERNANCE.md) for project governance information. The [gvisor-users mailing list][gvisor-users-list] and [gvisor-dev mailing list][gvisor-dev-list] are good starting points for @@ -145,12 +114,9 @@ See [Contributing.md](CONTRIBUTING.md). [bazel]: https://bazel.build [community]: https://gvisor.googlesource.com/community [docker]: https://www.docker.com -[git]: https://git-scm.com [gvisor-users-list]: https://groups.google.com/forum/#!forum/gvisor-users +[gvisor-dev]: https://gvisor.dev [gvisor-dev-list]: https://groups.google.com/forum/#!forum/gvisor-dev [oci]: https://www.opencontainers.org [old-linux]: https://gvisor.dev/docs/user_guide/networking/#gso -[python]: https://python.org -[rbe]: https://blog.bazel.build/2018/10/05/remote-build-execution.html [sandbox]: https://en.wikipedia.org/wiki/Sandbox_(computer_security) -[gvisor-dev]: https://gvisor.dev diff --git a/g3doc/BUILD b/g3doc/BUILD index 24177ad06..dbbf96204 100644 --- a/g3doc/BUILD +++ b/g3doc/BUILD @@ -9,6 +9,10 @@ doc( name = "index", src = "README.md", category = "Project", + data = glob([ + "*.png", + "*.svg", + ]), permalink = "/docs/", weight = "0", ) diff --git a/g3doc/architecture_guide/Layers.png b/g3doc/Layers.png Binary files differindex 308c6c451..308c6c451 100644 --- a/g3doc/architecture_guide/Layers.png +++ b/g3doc/Layers.png diff --git a/g3doc/architecture_guide/Layers.svg b/g3doc/Layers.svg index 0a366f841..0a366f841 100644 --- a/g3doc/architecture_guide/Layers.svg +++ b/g3doc/Layers.svg diff --git a/g3doc/architecture_guide/Machine-Virtualization.png b/g3doc/Machine-Virtualization.png Binary files differindex 1ba2ed6b2..1ba2ed6b2 100644 --- a/g3doc/architecture_guide/Machine-Virtualization.png +++ b/g3doc/Machine-Virtualization.png diff --git a/g3doc/architecture_guide/Machine-Virtualization.svg b/g3doc/Machine-Virtualization.svg index 5352da07b..5352da07b 100644 --- a/g3doc/architecture_guide/Machine-Virtualization.svg +++ b/g3doc/Machine-Virtualization.svg diff --git a/g3doc/README.md b/g3doc/README.md index 7999f5d47..304a91493 100644 --- a/g3doc/README.md +++ b/g3doc/README.md @@ -1,6 +1,6 @@ # What is gVisor? -gVisor is a user-space kernel, written in Go, that implements a substantial +gVisor is an application kernel, written in Go, that implements a substantial portion of the [Linux system call interface][linux]. It provides an additional layer of isolation between running applications and the host operating system. @@ -9,19 +9,160 @@ that makes it easy to work with existing container tooling. The `runsc` runtime integrates with Docker and Kubernetes, making it simple to run sandboxed containers. -gVisor takes a distinct approach to container sandboxing and makes a different -set of technical trade-offs compared to existing sandbox technologies, thus -providing new tools and ideas for the container security landscape. - gVisor can be used with Docker, Kubernetes, or directly using `runsc`. Use the links below to see detailed instructions for each of them: -* [Docker](./user_guide/quick_start/docker/): The quickest and easiest way to - get started. -* [Kubernetes](./user_guide/quick_start/kubernetes/): Isolate Pods in your K8s - cluster with gVisor. -* [OCI Quick Start](./user_guide/quick_start/oci/): Expert mode. Customize +* [Docker](./user_guide/quick_start/docker.md): The quickest and easiest way + to get started. +* [Kubernetes](./user_guide/quick_start/kubernetes.md): Isolate Pods in your + K8s cluster with gVisor. +* [OCI Quick Start](./user_guide/quick_start/oci.md): Expert mode. Customize gVisor for your environment. +## What does gVisor do? + +gVisor provides a virtualized environment in order to sandbox containers. The +system interfaces normally implemented by the host kernel are moved into a +distinct, per-sandbox application kernel in order to minimize the risk of an +container escape exploit. gVisor does not introduce large fixed overheads +however, and still retains a process-like model with respect to resource +utilization. + +## How is this different? + +Two other approaches are commonly taken to provide stronger isolation than +native containers. + +**Machine-level virtualization**, such as [KVM][kvm] and [Xen][xen], exposes +virtualized hardware to a guest kernel via a Virtual Machine Monitor (VMM). This +virtualized hardware is generally enlightened (paravirtualized) and additional +mechanisms can be used to improve the visibility between the guest and host +(e.g. balloon drivers, paravirtualized spinlocks). Running containers in +distinct virtual machines can provide great isolation, compatibility and +performance (though nested virtualization may bring challenges in this area), +but for containers it often requires additional proxies and agents, and may +require a larger resource footprint and slower start-up times. + +![Machine-level virtualization](Machine-Virtualization.png "Machine-level virtualization") + +**Rule-based execution**, such as [seccomp][seccomp], [SELinux][selinux] and +[AppArmor][apparmor], allows the specification of a fine-grained security policy +for an application or container. These schemes typically rely on hooks +implemented inside the host kernel to enforce the rules. If the surface can be +made small enough, then this is an excellent way to sandbox applications and +maintain native performance. However, in practice it can be extremely difficult +(if not impossible) to reliably define a policy for arbitrary, previously +unknown applications, making this approach challenging to apply universally. + +![Rule-based execution](Rule-Based-Execution.png "Rule-based execution") + +Rule-based execution is often combined with additional layers for +defense-in-depth. + +**gVisor** provides a third isolation mechanism, distinct from those above. + +gVisor intercepts application system calls and acts as the guest kernel, without +the need for translation through virtualized hardware. gVisor may be thought of +as either a merged guest kernel and VMM, or as seccomp on steroids. This +architecture allows it to provide a flexible resource footprint (i.e. one based +on threads and memory mappings, not fixed guest physical resources) while also +lowering the fixed costs of virtualization. However, this comes at the price of +reduced application compatibility and higher per-system call overhead. + +![gVisor](Layers.png "gVisor") + +On top of this, gVisor employs rule-based execution to provide defense-in-depth +(details below). + +gVisor's approach is similar to [User Mode Linux (UML)][uml], although UML +virtualizes hardware internally and thus provides a fixed resource footprint. + +Each of the above approaches may excel in distinct scenarios. For example, +machine-level virtualization will face challenges achieving high density, while +gVisor may provide poor performance for system call heavy workloads. + +## Why Go? + +gVisor is written in [Go][golang] in order to avoid security pitfalls that can +plague kernels. With Go, there are strong types, built-in bounds checks, no +uninitialized variables, no use-after-free, no stack overflow, and a built-in +race detector. However, the use of Go has its challenges, and the runtime often +introduces performance overhead. + +## What are the different components? + +A gVisor sandbox consists of multiple processes. These processes collectively +comprise an environment in which one or more containers can be run. + +Each sandbox has its own isolated instance of: + +* The **Sentry**, which is a kernel that runs the containers and intercepts + and responds to system calls made by the application. + +Each container running in the sandbox has its own isolated instance of: + +* A **Gofer** which provides file system access to the containers. + +![gVisor architecture diagram](Sentry-Gofer.png "gVisor architecture diagram") + +## What is runsc? + +The entrypoint to running a sandboxed container is the `runsc` executable. +`runsc` implements the [Open Container Initiative (OCI)][oci] runtime +specification, which is used by Docker and Kubernetes. This means that OCI +compatible _filesystem bundles_ can be run by `runsc`. Filesystem bundles are +comprised of a `config.json` file containing container configuration, and a root +filesystem for the container. Please see the [OCI runtime spec][runtime-spec] +for more information on filesystem bundles. `runsc` implements multiple commands +that perform various functions such as starting, stopping, listing, and querying +the status of containers. + +### Sentry + +<a name="sentry"></a> <!-- For deep linking. --> + +The Sentry is the largest component of gVisor. It can be thought of as a +application kernel. The Sentry implements all the kernel functionality needed by +the application, including: system calls, signal delivery, memory management and +page faulting logic, the threading model, and more. + +When the application makes a system call, the +[Platform](./architecture_guide/platforms.md) redirects the call to the Sentry, +which will do the necessary work to service it. It is important to note that the +Sentry does not pass system calls through to the host kernel. As a userspace +application, the Sentry will make some host system calls to support its +operation, but it does not allow the application to directly control the system +calls it makes. For example, the Sentry is not able to open files directly; file +system operations that extend beyond the sandbox (not internal `/proc` files, +pipes, etc) are sent to the Gofer, described below. + +### Gofer + +<a name="gofer"></a> <!-- For deep linking. --> + +The Gofer is a standard host process which is started with each container and +communicates with the Sentry via the [9P protocol][9p] over a socket or shared +memory channel. The Sentry process is started in a restricted seccomp container +without access to file system resources. The Gofer mediates all access to the +these resources, providing an additional level of isolation. + +### Application + +The application is a normal Linux binary provided to gVisor in an OCI runtime +bundle. gVisor aims to provide an environment equivalent to Linux v4.4, so +applications should be able to run unmodified. However, gVisor does not +presently implement every system call, `/proc` file, or `/sys` file so some +incompatibilities may occur. See [Commpatibility](./user_guide/compatibility.md) +for more information. + +[9p]: https://en.wikipedia.org/wiki/9P_(protocol) +[apparmor]: https://wiki.ubuntu.com/AppArmor +[golang]: https://golang.org +[kvm]: https://www.linux-kvm.org [linux]: https://en.wikipedia.org/wiki/Linux_kernel_interfaces [oci]: https://www.opencontainers.org +[runtime-spec]: https://github.com/opencontainers/runtime-spec +[seccomp]: https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt +[selinux]: https://selinuxproject.org +[uml]: http://user-mode-linux.sourceforge.net/ +[xen]: https://www.xenproject.org diff --git a/g3doc/architecture_guide/Rule-Based-Execution.png b/g3doc/Rule-Based-Execution.png Binary files differindex b42654a90..b42654a90 100644 --- a/g3doc/architecture_guide/Rule-Based-Execution.png +++ b/g3doc/Rule-Based-Execution.png diff --git a/g3doc/architecture_guide/Rule-Based-Execution.svg b/g3doc/Rule-Based-Execution.svg index bd6717043..bd6717043 100644 --- a/g3doc/architecture_guide/Rule-Based-Execution.svg +++ b/g3doc/Rule-Based-Execution.svg diff --git a/g3doc/architecture_guide/Sentry-Gofer.png b/g3doc/Sentry-Gofer.png Binary files differindex ca2c27ef7..ca2c27ef7 100644 --- a/g3doc/architecture_guide/Sentry-Gofer.png +++ b/g3doc/Sentry-Gofer.png diff --git a/g3doc/architecture_guide/Sentry-Gofer.svg b/g3doc/Sentry-Gofer.svg index 5c10750d2..5c10750d2 100644 --- a/g3doc/architecture_guide/Sentry-Gofer.svg +++ b/g3doc/Sentry-Gofer.svg diff --git a/g3doc/architecture_guide/BUILD b/g3doc/architecture_guide/BUILD index 72038305b..404f627a4 100644 --- a/g3doc/architecture_guide/BUILD +++ b/g3doc/architecture_guide/BUILD @@ -6,30 +6,12 @@ package( ) doc( - name = "index", - src = "README.md", - category = "Architecture Guide", - data = [ - "Layers.png", - "Layers.svg", - "Machine-Virtualization.png", - "Machine-Virtualization.svg", - "Rule-Based-Execution.png", - "Rule-Based-Execution.svg", - "Sentry-Gofer.png", - "Sentry-Gofer.svg", - ], - permalink = "/docs/architecture_guide/", - weight = "0", -) - -doc( name = "platforms", src = "platforms.md", category = "Architecture Guide", data = [ - "Sentry-Gofer.png", - "Sentry-Gofer.svg", + "platforms.png", + "platforms.svg", ], permalink = "/docs/architecture_guide/platforms/", weight = "40", @@ -39,6 +21,10 @@ doc( name = "resources", src = "resources.md", category = "Architecture Guide", + data = [ + "resources.png", + "resources.svg", + ], permalink = "/docs/architecture_guide/resources/", weight = "30", ) @@ -48,8 +34,8 @@ doc( src = "security.md", category = "Architecture Guide", data = [ - "Layers.png", - "Layers.svg", + "security.png", + "security.svg", ], permalink = "/docs/architecture_guide/security/", weight = "10", diff --git a/g3doc/architecture_guide/README.md b/g3doc/architecture_guide/README.md deleted file mode 100644 index 1364a5358..000000000 --- a/g3doc/architecture_guide/README.md +++ /dev/null @@ -1,80 +0,0 @@ -# Overview - -gVisor provides a virtualized environment in order to sandbox untrusted -containers. The system interfaces normally implemented by the host kernel are -moved into a distinct, per-sandbox user space kernel in order to minimize the -risk of an exploit. gVisor does not introduce large fixed overheads however, and -still retains a process-like model with respect to resource utilization. - -## How is this different? - -Two other approaches are commonly taken to provide stronger isolation than -native containers. - -**Machine-level virtualization**, such as [KVM][kvm] and [Xen][xen], exposes -virtualized hardware to a guest kernel via a Virtual Machine Monitor (VMM). This -virtualized hardware is generally enlightened (paravirtualized) and additional -mechanisms can be used to improve the visibility between the guest and host -(e.g. balloon drivers, paravirtualized spinlocks). Running containers in -distinct virtual machines can provide great isolation, compatibility and -performance (though nested virtualization may bring challenges in this area), -but for containers it often requires additional proxies and agents, and may -require a larger resource footprint and slower start-up times. - -![Machine-level virtualization](Machine-Virtualization.png "Machine-level virtualization") - -**Rule-based execution**, such as [seccomp][seccomp], [SELinux][selinux] and -[AppArmor][apparmor], allows the specification of a fine-grained security policy -for an application or container. These schemes typically rely on hooks -implemented inside the host kernel to enforce the rules. If the surface can be -made small enough (i.e. a sufficiently complete policy defined), then this is an -excellent way to sandbox applications and maintain native performance. However, -in practice it can be extremely difficult (if not impossible) to reliably define -a policy for arbitrary, previously unknown applications, making this approach -challenging to apply universally. - -![Rule-based execution](Rule-Based-Execution.png "Rule-based execution") - -Rule-based execution is often combined with additional layers for -defense-in-depth. - -**gVisor** provides a third isolation mechanism, distinct from those above. - -gVisor intercepts application system calls and acts as the guest kernel, without -the need for translation through virtualized hardware. gVisor may be thought of -as either a merged guest kernel and VMM, or as seccomp on steroids. This -architecture allows it to provide a flexible resource footprint (i.e. one based -on threads and memory mappings, not fixed guest physical resources) while also -lowering the fixed costs of virtualization. However, this comes at the price of -reduced application compatibility and higher per-system call overhead. - -![gVisor](Layers.png "gVisor") - -On top of this, gVisor employs rule-based execution to provide defense-in-depth -(details below). - -gVisor's approach is similar to [User Mode Linux (UML)][uml], although UML -virtualizes hardware internally and thus provides a fixed resource footprint. - -Each of the above approaches may excel in distinct scenarios. For example, -machine-level virtualization will face challenges achieving high density, while -gVisor may provide poor performance for system call heavy workloads. - -### Why Go? - -gVisor is written in [Go][golang] in order to avoid security pitfalls that can -plague kernels. With Go, there are strong types, built-in bounds checks, no -uninitialized variables, no use-after-free, no stack overflow, and a built-in -race detector. (The use of Go has its challenges too, and isn't free.) - -### What about Gofers? - -<a name="gofer"></a> <!-- For deep linking. --> - -[apparmor]: https://wiki.ubuntu.com/AppArmor -[golang]: https://golang.org -[kvm]: https://www.linux-kvm.org -[seccomp]: https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt -[selinux]: https://selinuxproject.org -[uml]: http://user-mode-linux.sourceforge.net/ -[xen]: https://www.xenproject.org diff --git a/g3doc/architecture_guide/performance.md b/g3doc/architecture_guide/performance.md index 3862d78ee..39dbb0045 100644 --- a/g3doc/architecture_guide/performance.md +++ b/g3doc/architecture_guide/performance.md @@ -13,12 +13,13 @@ forms: additional cycles and memory usage, which may manifest as increased latency, reduced throughput or density, or not at all. In general, these costs come from two different sources. -First, the existence of the [Sentry](../) means that additional memory will be -required, and application system calls must traverse additional layers of -software. The design emphasizes [security](../security/) and therefore we chose -to use a language for the Sentry that provides benefits in this domain but may -not yet offer the raw performance of other choices. Costs imposed by these -design choices are **structural costs**. +First, the existence of the [Sentry](../README.md#sentry) means that additional +memory will be required, and application system calls must traverse additional +layers of software. The design emphasizes +[security](/docs/architecture_guide/security/) and therefore we chose to use a +language for the Sentry that provides benefits in this domain but may not yet +offer the raw performance of other choices. Costs imposed by these design +choices are **structural costs**. Second, as gVisor is an independent implementation of the system call surface, many of the subsystems or specific calls are not as optimized as more mature @@ -50,7 +51,7 @@ Virtual Machines (VMs) with the following specifications: Through this document, `runsc` is used to indicate the runtime provided by gVisor. When relevant, we use the name `runsc-platform` to describe a specific -[platform choice](../platforms/). +[platform choice](/docs/architecture_guide/platforms/). **Except where specified, all tests below are conducted with the `ptrace` platform. The `ptrace` platform works everywhere and does not require hardware @@ -131,11 +132,11 @@ full start-up and run time for the workload, which trains a model. ## System calls Some **structural costs** of gVisor are heavily influenced by the -[platform choice](../platforms/), which implements system call interception. -Today, gVisor supports a variety of platforms. These platforms present distinct -performance, compatibility and security trade-offs. For example, the KVM -platform has low overhead system call interception but runs poorly with nested -virtualization. +[platform choice](/docs/architecture_guide/platforms/), which implements system +call interception. Today, gVisor supports a variety of platforms. These +platforms present distinct performance, compatibility and security trade-offs. +For example, the KVM platform has low overhead system call interception but runs +poorly with nested virtualization. {% include graph.html id="syscall" url="/performance/syscall.csv" title="perf.py syscall --runtime=runc --runtime=runsc-ptrace --runtime=runsc-kvm" y_min="100" @@ -163,7 +164,8 @@ overhead. Some of these costs above are **structural costs**, and `redis` is likely to remain a challenging performance scenario. However, optimizing the -[platform](../platforms/) will also have a dramatic impact. +[platform](/docs/architecture_guide/platforms/) will also have a dramatic +impact. ## Start-up time @@ -184,7 +186,7 @@ similarly loads a number of modules and binds an HTTP server. > Note: most of the time overhead above is associated Docker itself. This is > evident with the empty `runc` benchmark. To avoid these costs with `runsc`, > you may also consider using `runsc do` mode or invoking the -> [OCI runtime](../../user_guide/quick_start/oci/) directly. +> [OCI runtime](../user_guide/quick_start/oci.md) directly. ## Network @@ -222,8 +224,9 @@ In terms of raw disk I/O, gVisor does not introduce significant fundamental overhead. For general file operations, gVisor introduces a small fixed overhead for data that transitions across the sandbox boundary. This manifests as **structural costs** in some cases, since these operations must be routed -through the [Gofer](../) as a result of our [security model](../security/), but -in most cases are dominated by **implementation costs**, due to an internal +through the [Gofer](../README.md#gofer) as a result of our +[Security Model](/docs/architecture_guide/security/), but in most cases are +dominated by **implementation costs**, due to an internal [Virtual File System][vfs] (VFS) implementation that needs improvement. {% include graph.html id="fio-bw" url="/performance/fio.csv" title="perf.py fio diff --git a/g3doc/architecture_guide/platforms.md b/g3doc/architecture_guide/platforms.md index 6e63da8ce..d112c9a28 100644 --- a/g3doc/architecture_guide/platforms.md +++ b/g3doc/architecture_guide/platforms.md @@ -1,86 +1,61 @@ # Platform Guide -A gVisor sandbox consists of multiple processes when running. These processes -collectively comprise a shared environment in which one or more containers can -be run. +[TOC] -Each sandbox has its own isolated instance of: - -* The **Sentry**, A user-space kernel that runs the container and intercepts - and responds to system calls made by the application. - -Each container running in the sandbox has its own isolated instance of: - -* A **Gofer** which provides file system access to the container. - -![gVisor architecture diagram](Sentry-Gofer.png "gVisor architecture diagram") - -## runsc - -The entrypoint to running a sandboxed container is the `runsc` executable. -`runsc` implements the [Open Container Initiative (OCI)][oci] runtime -specification. This means that OCI compatible _filesystem bundles_ can be run by -`runsc`. Filesystem bundles are comprised of a `config.json` file containing -container configuration, and a root filesystem for the container. Please see the -[OCI runtime spec][runtime-spec] for more information on filesystem bundles. -`runsc` implements multiple commands that perform various functions such as -starting, stopping, listing, and querying the status of containers. +gVisor requires a platform to implement interception of syscalls, basic context +switching, and memory mapping functionality. Internally, gVisor uses an +abstraction sensibly called [Platform][platform]. A simplified version of this +interface looks like: -## Sentry +```golang +type Platform interface { + NewAddressSpace() (AddressSpace, error) + NewContext() Context +} -The Sentry is the largest component of gVisor. It can be thought of as a -userspace OS kernel. The Sentry implements all the kernel functionality needed -by the untrusted application. It implements all of the supported system calls, -signal delivery, memory management and page faulting logic, the threading model, -and more. +type Context interface { + Switch(as AddressSpace, ac arch.Context) (..., error) +} -When the untrusted application makes a system call, the currently used platform -redirects the call to the Sentry, which will do the necessary work to service -it. It is important to note that the Sentry will not simply pass through system -calls to the host kernel. As a userspace application, the Sentry will make some -host system calls to support its operation, but it will not allow the -application to directly control the system calls it makes. +type AddressSpace interface { + MapFile(addr usermem.Addr, f File, fr FileRange, at usermem.AccessType, ...) error + Unmap(addr usermem.Addr, length uint64) +} +``` -The Sentry aims to present an equivalent environment to (upstream) Linux v4.4. +There are a number of different ways to implement this interface that come with +various trade-offs, generally around performance and hardware requirements. -File system operations that extend beyond the sandbox (not internal /proc files, -pipes, etc) are sent to the Gofer, described below. +## Implementations -## Platforms +The choice of platform depends on the context in which `runsc` is executing. In +general, virtualized platforms may be limited to platforms that do not require +hardware virtualized support (since the hardware is already in use): -gVisor requires a platform to implement interception of syscalls, basic context -switching, and memory mapping functionality. +![Platforms](platforms.png "Platform examples.") ### ptrace -The ptrace platform uses `PTRACE_SYSEMU` to execute user code without allowing -it to execute host system calls. This platform can run anywhere that ptrace -works (even VMs without nested virtualization). - -### KVM (experimental) +The ptrace platform uses [PTRACE_SYSEMU][ptrace] to execute user code without +allowing it to execute host system calls. This platform can run anywhere that +`ptrace` works (even VMs without nested virtualization), which is ubiquitous. -The KVM platform allows the Sentry to act as both guest OS and VMM, switching -back and forth between the two worlds seamlessly. The KVM platform can run on -bare-metal or in a VM with nested virtualization enabled. While there is no -virtualized hardware layer -- the sandbox retains a process model -- gVisor -leverages virtualization extensions available on modern processors in order to -improve isolation and performance of address space switches. +Unfortunately, the ptrace platform has high context switch overhead, so system +call-heavy applications may pay a [performance penalty](./performance.md). -## Gofer +### KVM -The Gofer is a normal host Linux process. The Gofer is started with each sandbox -and connected to the Sentry. The Sentry process is started in a restricted -seccomp container without access to file system resources. The Gofer provides -the Sentry access to file system resources via the 9P protocol and provides an -additional level of isolation. +The KVM platform uses the kernel's [KVM][kvm] functionality to allow the Sentry +to act as both guest OS and VMM. The KVM platform can run on bare-metal or in a +VM with nested virtualization enabled. While there is no virtualized hardware +layer -- the sandbox retains a process model -- gVisor leverages virtualization +extensions available on modern processors in order to improve isolation and +performance of address space switches. -## Application +## Changing Platforms -The application (aka the untrusted application) is a normal Linux binary -provided to gVisor in an OCI runtime bundle. gVisor aims to provide an -environment equivalent to Linux v4.4, so applications should be able to run -unmodified. However, gVisor does not presently implement every system call, -/proc file, or /sys file so some incompatibilities may occur. +See [Changing Platforms](../user_guide/platforms.md). -[oci]: https://www.opencontainers.org -[runtime-spec]: https://github.com/opencontainers/runtime-spec +[kvm]: https://www.kernel.org/doc/Documentation/virtual/kvm/api.txt +[platform]: https://cs.opensource.google/gvisor/gvisor/+/release-20190304.1:pkg/sentry/platform/platform.go;l=33 +[ptrace]: http://man7.org/linux/man-pages/man2/ptrace.2.html diff --git a/g3doc/architecture_guide/platforms.png b/g3doc/architecture_guide/platforms.png Binary files differnew file mode 100644 index 000000000..005d56feb --- /dev/null +++ b/g3doc/architecture_guide/platforms.png diff --git a/g3doc/architecture_guide/platforms.svg b/g3doc/architecture_guide/platforms.svg new file mode 100644 index 000000000..b0bac9ba7 --- /dev/null +++ b/g3doc/architecture_guide/platforms.svg @@ -0,0 +1,334 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Created with Inkscape (http://www.inkscape.org/) --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="142.67763mm" + height="67.063133mm" + viewBox="0 0 142.67763 67.063134" + version="1.1" + id="svg8" + inkscape:export-filename="/home/ascannell/resources.png" + inkscape:export-xdpi="53.50127" + inkscape:export-ydpi="53.50127" + inkscape:version="0.92.4 (5da689c313, 2019-01-14)" + sodipodi:docname="platforms.svg"> + <defs + id="defs2" /> + <sodipodi:namedview + id="base" + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1.0" + inkscape:pageopacity="0.0" + inkscape:pageshadow="2" + inkscape:zoom="0.98994949" + inkscape:cx="86.443612" + inkscape:cy="102.88104" + inkscape:document-units="mm" + inkscape:current-layer="layer1" + showgrid="false" + fit-margin-top="0" + fit-margin-left="0" + fit-margin-right="0" + fit-margin-bottom="0" + inkscape:window-width="1920" + inkscape:window-height="1005" + inkscape:window-x="0" + inkscape:window-y="0" + inkscape:window-maximized="1" /> + <metadata + id="metadata5"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <g + inkscape:label="Layer 1" + inkscape:groupmode="layer" + id="layer1" + transform="translate(-36.081387,-98.953278)"> + <rect + id="rect10" + width="33.408691" + height="33.408691" + x="36.081387" + y="120.06757" + style="fill:#44aa00;stroke-width:0.26458332" /> + <rect + style="fill:#b3b3b3;stroke-width:0.23881446" + id="rect16" + width="142.45465" + height="10.423517" + x="36.08139" + y="155.5929" /> + <rect + id="rect10-7" + width="30.52453" + height="18.976137" + x="37.416695" + y="121.65508" + style="fill:#ff8080;stroke-width:0.19060372" /> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314" + x="41.03727" + y="148.58765" + id="text65"><tspan + sodipodi:role="line" + id="tspan63" + x="41.03727" + y="148.58765" + style="stroke-width:0.08507314">gVisor</tspan></text> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847" + x="45.473087" + y="132.50232" + id="text123"><tspan + sodipodi:role="line" + id="tspan121" + x="45.473087" + y="132.50232" + style="stroke-width:0.08327847">workload</tspan></text> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:6.43922186px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.16098055" + x="97.768547" + y="163.15665" + id="text163"><tspan + sodipodi:role="line" + id="tspan161" + x="97.768547" + y="163.15665" + style="stroke-width:0.16098055">host</tspan></text> + <rect + style="fill:#e9afdd;stroke-width:0.39185274" + id="rect16-7" + width="72.9646" + height="54.79026" + x="105.79441" + y="98.953278" /> + <rect + id="rect10-5" + width="33.408691" + height="33.408691" + x="108.24348" + y="100.53072" + style="fill:#44aa00;stroke-width:0.26458332" /> + <rect + id="rect10-7-6" + width="30.52453" + height="20.045216" + x="109.57877" + y="102.11823" + style="fill:#ff8080;stroke-width:0.19589928" /> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314" + x="112.86765" + y="129.01863" + id="text65-2"><tspan + sodipodi:role="line" + id="tspan63-9" + x="112.86765" + y="129.01863" + style="stroke-width:0.08507314">gVisor</tspan></text> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847" + x="117.63519" + y="114.02371" + id="text123-1"><tspan + sodipodi:role="line" + id="tspan121-2" + x="117.63519" + y="114.02371" + style="stroke-width:0.08327847">workload</tspan></text> + <rect + id="rect10-7-7" + width="11.815663" + height="8.0126781" + x="54.538059" + y="143.27702" + style="fill:#aaccff;stroke-width:0.07705856" /> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:4.35074377px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.10876859" + x="55.931114" + y="148.90578" + id="text144"><tspan + sodipodi:role="line" + id="tspan142" + x="55.931114" + y="148.90578" + style="stroke-width:0.10876859">KVM</tspan></text> + <rect + id="rect10-6" + width="33.408691" + height="33.408691" + x="71.044685" + y="119.73112" + style="fill:#44aa00;stroke-width:0.26458332" /> + <rect + id="rect10-7-0" + width="30.52453" + height="18.976137" + x="72.37999" + y="121.31865" + style="fill:#ff8080;stroke-width:0.19060372" /> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314" + x="76.000565" + y="148.25128" + id="text65-6"><tspan + sodipodi:role="line" + id="tspan63-2" + x="76.000565" + y="148.25128" + style="stroke-width:0.08507314">gVisor</tspan></text> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847" + x="80.436386" + y="132.16595" + id="text123-6"><tspan + sodipodi:role="line" + id="tspan121-1" + x="80.436386" + y="132.16595" + style="stroke-width:0.08327847">workload</tspan></text> + <rect + id="rect10-7-7-8" + width="11.815664" + height="8.0126781" + x="89.501358" + y="142.94067" + style="fill:#ffeeaa;stroke-width:0.07705856" /> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:3.39456654px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08486416" + x="89.92292" + y="147.89806" + id="text144-7"><tspan + sodipodi:role="line" + id="tspan142-9" + x="89.92292" + y="147.89806" + style="stroke-width:0.08486416">ptrace</tspan></text> + <rect + id="rect10-7-7-8-3" + width="11.815665" + height="8.0126781" + x="127.08897" + y="123.97878" + style="fill:#ffeeaa;stroke-width:0.07705856" /> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:3.39456654px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08486416" + x="127.51052" + y="128.9362" + id="text144-7-7"><tspan + sodipodi:role="line" + id="tspan142-9-5" + x="127.51052" + y="128.9362" + style="stroke-width:0.08486416">ptrace</tspan></text> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:5.45061255px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.13626531" + x="138.49318" + y="152.11841" + id="text229"><tspan + sodipodi:role="line" + id="tspan227" + x="138.49318" + y="152.11841" + style="stroke-width:0.13626531">VM</tspan></text> + <rect + style="fill:#b3b3b3;stroke-width:0.16518368" + id="rect16-9" + width="68.15374" + height="10.423517" + x="108.24348" + y="134.99774" /> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:6.17854786px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.15446369" + x="132.91473" + y="142.07658" + id="text248"><tspan + sodipodi:role="line" + id="tspan246" + x="132.91473" + y="142.07658" + style="stroke-width:0.15446369">guest</tspan></text> + <rect + id="rect10-5-2" + width="33.408691" + height="33.408691" + x="143.32402" + y="100.35877" + style="fill:#44aa00;stroke-width:0.26458332" /> + <rect + id="rect10-7-6-2" + width="30.52453" + height="20.045216" + x="144.65933" + y="101.94627" + style="fill:#ff8080;stroke-width:0.19589929" /> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314" + x="147.94815" + y="128.84665" + id="text65-2-8"><tspan + sodipodi:role="line" + id="tspan63-9-9" + x="147.94815" + y="128.84665" + style="stroke-width:0.08507314">gVisor</tspan></text> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847" + x="152.71565" + y="113.85176" + id="text123-1-7"><tspan + sodipodi:role="line" + id="tspan121-2-3" + x="152.71565" + y="113.85176" + style="stroke-width:0.08327847">workload</tspan></text> + <rect + id="rect10-7-7-8-3-6" + width="11.815666" + height="8.0126781" + x="162.16933" + y="123.80682" + style="fill:#ffeeaa;stroke-width:0.07705856" /> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:3.39456654px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08486416" + x="162.59088" + y="128.76421" + id="text144-7-7-1"><tspan + sodipodi:role="line" + id="tspan142-9-5-2" + x="162.59088" + y="128.76421" + style="stroke-width:0.08486416">ptrace</tspan></text> + </g> +</svg> diff --git a/g3doc/architecture_guide/resources.md b/g3doc/architecture_guide/resources.md index 894f995ae..1dec37bd1 100644 --- a/g3doc/architecture_guide/resources.md +++ b/g3doc/architecture_guide/resources.md @@ -10,9 +10,10 @@ sandbox to be highly dynamic in terms of resource usage: spanning a large number of cores and large amount of memory when busy, and yielding those resources back to the host when not. -Some of the details here may depend on the [platform](../platforms/), but in -general this page describes the resource model used by gVisor. If you're not -familiar with the terms here, uou may want to start with the [Overview](../). +In order words, the shape of the sandbox should closely track the shape of the +sandboxed process: + +![Resource model](resources.png "Workloads of different shapes.") ## Processes @@ -23,9 +24,9 @@ the sandbox (e.g. via a [Docker exec][exec]). ## Networking -Similarly to processes, the sandbox attaches a network endpoint to the system, -but runs it's own network stack. All network resources, other than packets in -flight, exist only inside the sandbox, bound by relevant resource limits. +The sandbox attaches a network endpoint to the system, but runs it's own network +stack. All network resources, other than packets in flight on the host, exist +only inside the sandbox, bound by relevant resource limits. You can interact with network endpoints exposed by the sandbox, just as you would any other container, but network introspection similarly requires entering @@ -33,15 +34,14 @@ the sandbox. ## Files -Files may be backed by different implementations. For host-native files (where a -file descriptor is available), the Gofer may return a file descriptor to the -Sentry via [SCM_RIGHTS][scmrights][^1]. +Files in the sandbox may be backed by different implementations. For host-native +files (where a file descriptor is available), the Gofer may return a file +descriptor to the Sentry via [SCM_RIGHTS][scmrights][^1]. These files may be read from and written to through standard system calls, and also mapped into the associated application's address space. This allows the same host memory to be shared across multiple sandboxes, although this mechanism -does not preclude the use of side-channels (see the -[security model](../security/)). +does not preclude the use of side-channels (see [Security Model](./security.md). Note that some file systems exist only within the context of the sandbox. For example, in many cases a `tmpfs` mount will be available at `/tmp` or @@ -64,8 +64,9 @@ scheduling decisions about all application threads. ## Time Time in the sandbox is provided by the Sentry, through its own [vDSO][vdso] and -timekeeping implementation. This is divorced from the host time, and no state is -shared with the host, although the time will be initialized with the host clock. +time-keeping implementation. This is distinct from the host time, and no state +is shared with the host, although the time will be initialized with the host +clock. The Sentry runs timers to note the passage of time, much like a kernel running on hardware (though the timers are software timers, in this case). These timers diff --git a/g3doc/architecture_guide/resources.png b/g3doc/architecture_guide/resources.png Binary files differnew file mode 100644 index 000000000..f715008ec --- /dev/null +++ b/g3doc/architecture_guide/resources.png diff --git a/g3doc/architecture_guide/resources.svg b/g3doc/architecture_guide/resources.svg new file mode 100644 index 000000000..fd7805d90 --- /dev/null +++ b/g3doc/architecture_guide/resources.svg @@ -0,0 +1,208 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Created with Inkscape (http://www.inkscape.org/) --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="108.24417mm" + height="47.513165mm" + viewBox="0 0 108.24417 47.513165" + version="1.1" + id="svg8" + inkscape:export-filename="/home/ascannell/resources.png" + inkscape:export-xdpi="53.50127" + inkscape:export-ydpi="53.50127" + inkscape:version="0.92.4 (5da689c313, 2019-01-14)" + sodipodi:docname="resources.svg"> + <defs + id="defs2" /> + <sodipodi:namedview + id="base" + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1.0" + inkscape:pageopacity="0.0" + inkscape:pageshadow="2" + inkscape:zoom="0.98994949" + inkscape:cx="16.897058" + inkscape:cy="41.261746" + inkscape:document-units="mm" + inkscape:current-layer="layer1" + showgrid="false" + fit-margin-top="0" + fit-margin-left="0" + fit-margin-right="0" + fit-margin-bottom="0" + inkscape:window-width="1920" + inkscape:window-height="1005" + inkscape:window-x="0" + inkscape:window-y="0" + inkscape:window-maximized="1" /> + <metadata + id="metadata5"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <g + inkscape:label="Layer 1" + inkscape:groupmode="layer" + id="layer1" + transform="translate(-36.081387,-118.50325)"> + <rect + id="rect10" + width="33.408691" + height="33.408691" + x="36.081387" + y="120.06757" + style="fill:#44aa00;stroke-width:0.26458332" /> + <circle + style="fill:#44aa00;stroke-width:0.21849461" + id="path12" + cx="87.958534" + cy="136.63828" + r="17.105247" /> + <path + sodipodi:type="star" + style="fill:#44aa00;stroke-width:0.26458332" + id="path14" + sodipodi:sides="3" + sodipodi:cx="124.13387" + sodipodi:cy="141.81859" + sodipodi:r1="23.31534" + sodipodi:r2="11.65767" + sodipodi:arg1="0.52359878" + sodipodi:arg2="1.5707963" + inkscape:flatsided="false" + inkscape:rounded="0" + inkscape:randomized="0" + d="m 144.32555,153.47626 -20.19168,0 -20.19167,0 10.09583,-17.48651 10.09584,-17.4865 10.09584,17.4865 z" + inkscape:transform-center-x="1.8384776e-06" + inkscape:transform-center-y="-5.8288369" /> + <rect + style="fill:#b3b3b3;stroke-width:0.20817307" + id="rect16" + width="108.24416" + height="10.423517" + x="36.08139" + y="155.5929" /> + <path + sodipodi:type="star" + style="fill:#ff8080;stroke-width:0.20018946" + id="path14-3" + sodipodi:sides="3" + sodipodi:cx="124.13387" + sodipodi:cy="139.31911" + sodipodi:r1="17.640888" + sodipodi:r2="8.8204451" + sodipodi:arg1="0.52359878" + sodipodi:arg2="1.5707963" + inkscape:flatsided="false" + inkscape:rounded="0" + inkscape:randomized="0" + d="m 139.41133,148.13955 -15.27746,0 -15.27745,0 7.63872,-13.23067 7.63873,-13.23066 7.63873,13.23066 z" + inkscape:transform-center-x="3.9117172e-06" + inkscape:transform-center-y="-4.4102243" /> + <circle + style="fill:#ff8080;stroke-width:0.18094084" + id="path12-6" + cx="87.93705" + cy="134.75125" + r="14.165282" /> + <rect + id="rect10-7" + width="30.52453" + height="25.657875" + x="37.416695" + y="121.65508" + style="fill:#ff8080;stroke-width:0.22163473" /> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314" + x="47.387276" + y="151.7626" + id="text65"><tspan + sodipodi:role="line" + id="tspan63" + x="47.387276" + y="151.7626" + style="stroke-width:0.08507314">gVisor</tspan></text> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314" + x="82.156319" + y="151.71547" + id="text65-5"><tspan + sodipodi:role="line" + id="tspan63-3" + x="82.156319" + y="151.71547" + style="stroke-width:0.08507314">gVisor</tspan></text> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314" + x="118.66879" + y="151.71547" + id="text65-5-5"><tspan + sodipodi:role="line" + id="tspan63-3-6" + x="118.66879" + y="151.71547" + style="stroke-width:0.08507314">gVisor</tspan></text> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847" + x="45.473087" + y="136.20644" + id="text123"><tspan + sodipodi:role="line" + id="tspan121" + x="45.473087" + y="136.20644" + style="stroke-width:0.08327847">workload</tspan></text> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847" + x="80.153076" + y="136.00925" + id="text123-1"><tspan + sodipodi:role="line" + id="tspan121-2" + x="80.153076" + y="136.00925" + style="stroke-width:0.08327847">workload</tspan></text> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847" + x="116.50173" + y="138.68195" + id="text123-1-7"><tspan + sodipodi:role="line" + id="tspan121-2-0" + x="116.50173" + y="138.68195" + style="stroke-width:0.08327847">workload</tspan></text> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:6.43922186px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.16098055" + x="81.893562" + y="163.15665" + id="text163"><tspan + sodipodi:role="line" + id="tspan161" + x="81.893562" + y="163.15665" + style="stroke-width:0.16098055">host</tspan></text> + </g> +</svg> diff --git a/g3doc/architecture_guide/security.md b/g3doc/architecture_guide/security.md index f78586291..b99b86332 100644 --- a/g3doc/architecture_guide/security.md +++ b/g3doc/architecture_guide/security.md @@ -86,15 +86,17 @@ a substitute for a secure architecture*. ## Goals: Limiting Exposure -gVisor’s primary design goal is to minimize the System API attack vector while -still providing a process model. There are two primary security principles that -inform this design. First, the application’s direct interactions with the host -System API are intercepted by the Sentry, which implements the System API -instead. Second, the System API accessible to the Sentry itself is minimized to -a safer, restricted set. The first principle minimizes the possibility of direct -exploitation of the host System API by applications, and the second principle -minimizes indirect exploitability, which is the exploitation by an exploited or -buggy Sentry (e.g. chaining an exploit). +![Threat model](security.png "Threat model.") + +gVisor’s primary design goal is to minimize the System API attack vector through +multiple layers of defense, while still providing a process model. There are two +primary security principles that inform this design. First, the application’s +direct interactions with the host System API are intercepted by the Sentry, +which implements the System API instead. Second, the System API accessible to +the Sentry itself is minimized to a safer, restricted set. The first principle +minimizes the possibility of direct exploitation of the host System API by +applications, and the second principle minimizes indirect exploitability, which +is the exploitation by an exploited or buggy Sentry (e.g. chaining an exploit). The first principle is similar to the security basis for a Virtual Machine (VM). With a VM, an application’s interactions with the host are replaced by @@ -210,9 +212,9 @@ crashes are recorded and triaged to similarly identify material issues. ### Is this more or less secure than a Virtual Machine? The security of a VM depends to a large extent on what is exposed from the host -kernel and user space support code. For example, device emulation code in the +kernel and userspace support code. For example, device emulation code in the host kernel (e.g. APIC) or optimizations (e.g. vhost) can be more complex than a -simple system call, and exploits carry the same risks. Similarly, the user space +simple system call, and exploits carry the same risks. Similarly, the userspace support code is frequently unsandboxed, and exploits, while rare, may allow unfettered access to the system. @@ -245,8 +247,8 @@ In gVisor, the platforms that use ptrace operate differently. The stubs that are traced are never allowed to continue execution into the host kernel and complete a call directly. Instead, all system calls are interpreted and handled by the Sentry itself, who reflects resulting register state back into the tracee before -continuing execution in user space. This is very similar to the mechanism used -by User-Mode Linux (UML). +continuing execution in userspace. This is very similar to the mechanism used by +User-Mode Linux (UML). [dirtycow]: https://en.wikipedia.org/wiki/Dirty_COW [clang]: https://en.wikipedia.org/wiki/C_(programming_language) diff --git a/g3doc/architecture_guide/security.png b/g3doc/architecture_guide/security.png Binary files differnew file mode 100644 index 000000000..c29befbf6 --- /dev/null +++ b/g3doc/architecture_guide/security.png diff --git a/g3doc/architecture_guide/security.svg b/g3doc/architecture_guide/security.svg new file mode 100644 index 000000000..0575e2dec --- /dev/null +++ b/g3doc/architecture_guide/security.svg @@ -0,0 +1,153 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Created with Inkscape (http://www.inkscape.org/) --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="92.963379mm" + height="107.18885mm" + viewBox="0 0 92.963379 107.18885" + version="1.1" + id="svg8" + inkscape:version="0.92.4 (5da689c313, 2019-01-14)" + sodipodi:docname="defense.svg"> + <defs + id="defs2" /> + <sodipodi:namedview + id="base" + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1.0" + inkscape:pageopacity="0.0" + inkscape:pageshadow="2" + inkscape:zoom="0.98994949" + inkscape:cx="-242.99254" + inkscape:cy="136.90181" + inkscape:document-units="mm" + inkscape:current-layer="layer4" + showgrid="false" + inkscape:object-nodes="true" + inkscape:window-width="1920" + inkscape:window-height="1005" + inkscape:window-x="0" + inkscape:window-y="0" + inkscape:window-maximized="1" + fit-margin-top="0" + fit-margin-left="0" + fit-margin-right="0" + fit-margin-bottom="0" /> + <metadata + id="metadata5"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <g + inkscape:groupmode="layer" + id="layer2" + inkscape:label="Layer 2" + transform="translate(-61.112559,-78.160466)"> + <g + id="g4644" + style="fill:none;fill-opacity:0.34351148;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.25572576" + transform="matrix(1,0,0,-1,2.138671,277.94235)"> + <path + transform="scale(0.26458333)" + inkscape:connector-curvature="0" + style="opacity:1;fill:none;fill-opacity:0.34351148;stroke:#00a500;stroke-width:3.77952766;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.25572576" + d="M 398.57227,351.84766 224.7832,452.18359 398.57227,552.51953 572.35938,452.18359 Z" + id="path4638" /> + <path + inkscape:connector-curvature="0" + style="opacity:1;fill:none;fill-opacity:0.34351148;stroke:#00a500;stroke-width:3.77952766;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.25572576" + d="M 572.35938,452.18359 398.57227,552.51953 V 753.19141 L 572.35938,652.85547 Z" + transform="scale(0.26458333)" + id="path4640" /> + <path + id="path4642" + d="m 59.473888,119.64024 45.981172,26.54722 v 53.09443 L 59.473888,172.73467 Z" + style="opacity:1;fill:none;fill-opacity:0.34351148;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.25572576" + inkscape:connector-curvature="0" /> + </g> + </g> + <g + inkscape:groupmode="layer" + id="layer3" + inkscape:label="Layer 3" + transform="translate(-61.112559,-78.160466)"> + <g + id="g4554" + transform="matrix(-0.39771468,0.69855937,-0.69855937,-0.39771468,366.58103,126.65261)"> + <g + id="g4662" + transform="translate(59.46839,130.66062)"> + <path + inkscape:connector-curvature="0" + id="path4548" + transform="scale(0.26458333)" + d="M 398.57227,351.84766 224.7832,452.18359 398.57227,552.51953 572.35938,452.18359 Z" + style="opacity:1;fill:#0066ff;fill-opacity:0.34509804;stroke:#00a5ff;stroke-width:4.70182848;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> + <path + inkscape:connector-curvature="0" + id="path4550" + transform="scale(0.26458333)" + d="M 572.35938,452.18359 398.57227,552.51953 V 753.19141 L 572.35938,652.85547 Z" + style="opacity:1;fill:#0044aa;fill-opacity:0.34509804;stroke:#00a5ff;stroke-width:4.29276943;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> + <path + inkscape:connector-curvature="0" + style="opacity:1;fill:#5599ff;fill-opacity:0.34509804;stroke:#00a5ff;stroke-width:1.24402535;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" + d="m 59.473888,119.64024 45.981172,26.54722 v 53.09443 L 59.473888,172.73467 Z" + id="path4552" /> + </g> + </g> + </g> + <g + inkscape:groupmode="layer" + id="layer4" + inkscape:label="Layer 4" + transform="translate(-61.112559,-78.160466)"> + <path + style="fill:#e000ae;fill-opacity:1;stroke-width:0.12476727" + d="m 84.610811,107.36071 v 2.55773 2.55772 h 2.49535 2.49534 v -2.55772 -2.55773 h -2.49534 z m 40.674129,0 v 2.55773 2.55772 h 2.49535 2.49534 v -2.55772 -2.55773 h -2.49534 z m -35.558669,5.11545 v 2.55773 2.55773 h 2.49535 2.49534 v -2.55773 -2.55773 h -2.49534 z m 4.99069,5.11546 v 2.55773 2.55773 h -2.49534 -2.49535 v 2.49534 2.49535 h -2.55773 -2.55773 v 2.55773 2.55773 h -2.55773 -2.55773 v 10.16853 10.16853 h 2.55773 2.55773 v -7.67562 -7.67587 l 2.52654,0.0339 2.52654,0.0336 0.0327,5.08427 0.0327,5.08426 h 2.49388 2.49388 v 2.55919 2.5592 l 5.08427,-0.0327 5.084269,-0.0326 v -2.49534 -2.49535 l -5.084269,-0.0324 -5.08427,-0.0327 v -2.55626 -2.55651 h 12.726269 12.72626 v 2.55651 2.55626 l -5.05868,0.0327 -5.05893,0.0324 v 2.49535 2.49534 l 5.05893,0.0326 5.05868,0.0327 v -2.55919 -2.55919 h 2.49388 2.49413 l 0.0324,-5.08426 0.0327,-5.08427 2.52653,-0.0336 2.52654,-0.0339 v 7.67586 7.67563 h 2.55773 2.55773 v -10.16854 -10.16853 h -2.55773 -2.55773 v -2.55773 -2.55773 h -2.55773 -2.55773 v -2.49535 -2.49534 h -2.49535 -2.49534 v -2.55773 -2.55773 h -2.55773 -2.55773 v 2.55773 2.55773 h -7.6108 -7.610809 v -2.55773 -2.55773 h -2.55774 z m 25.452519,0 h 2.49535 2.49535 v -2.55773 -2.55773 h -2.49535 -2.49535 v 2.55773 z m -25.452519,10.10615 h 5.11546 5.115459 v 2.55773 2.55773 h -5.115459 -5.11546 v -2.55773 z m 15.221609,0 h 5.11546 5.11545 v 2.55773 2.55773 h -5.11545 -5.11546 v -2.55773 z" + id="path4732" + inkscape:connector-curvature="0" /> + </g> + <g + inkscape:label="Layer 1" + inkscape:groupmode="layer" + id="layer1" + style="display:inline" + transform="translate(-61.112559,-78.160466)"> + <g + transform="translate(-131.49557,42.495842)" + style="fill:#007200;fill-opacity:0.34351148;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" + id="g4628"> + <path + id="path4529" + d="m 239.09034,36.164616 -45.98169,26.547215 45.98169,26.547217 45.98117,-26.547217 z" + style="opacity:1;fill:#4aba19;fill-opacity:0.34509804;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" + inkscape:connector-curvature="0" /> + <path + id="path4531" + d="m 285.07151,62.711828 -45.98117,26.54722 v 53.094432 l 45.98117,-26.54722 z" + style="opacity:1;fill:#007900;fill-opacity:0.34351148;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" + inkscape:connector-curvature="0" /> + <path + inkscape:connector-curvature="0" + style="opacity:1;fill:#003d00;fill-opacity:0.34509804;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" + d="m 193.10865,62.711831 45.98117,26.54722 v 53.094429 l -45.98117,-26.54722 z" + id="path4541" /> + </g> + </g> +</svg> diff --git a/g3doc/user_guide/filesystem.md b/g3doc/user_guide/filesystem.md index 6c69f42a1..cd00762dd 100644 --- a/g3doc/user_guide/filesystem.md +++ b/g3doc/user_guide/filesystem.md @@ -4,8 +4,8 @@ gVisor accesses the filesystem through a file proxy, called the Gofer. The gofer runs as a separate process, that is isolated from the sandbox. Gofer instances -communicate with their respective sentry using the 9P protocol. For a more -detailed explanation see [Overview > Gofer](../../architecture_guide/#gofer). +communicate with their respective sentry using the 9P protocol. For another +explanation see [What is gVisor?](../README.md). ## Sandbox overlay diff --git a/g3doc/user_guide/install.md b/g3doc/user_guide/install.md index 0de2b9932..9afdd264d 100644 --- a/g3doc/user_guide/install.md +++ b/g3doc/user_guide/install.md @@ -150,11 +150,8 @@ users, and ensure it is executable by all users**, since `runsc` executes itself as user `nobody` to avoid unnecessary privileges. The `/usr/local/bin` directory is a good place to put the `runsc` binary. -After installation, the`runsc` binary comes with an `install` command that can -optionally automatically configure Docker: - -```bash -runsc install -``` +After installation, try out `runsc` by following the +[Docker Quick Start](./quick_start/docker.md) or +[OCI Quick Start](./quick_start/oci.md). [releases]: https://github.com/google/gvisor/releases diff --git a/g3doc/user_guide/platforms.md b/g3doc/user_guide/platforms.md index eefb6b222..752025881 100644 --- a/g3doc/user_guide/platforms.md +++ b/g3doc/user_guide/platforms.md @@ -1,56 +1,27 @@ -# Platforms (KVM) +# Changing Platforms [TOC] -This document will help you set up your system to use a different gVisor -platform. +This guide described how to change the +[platform](../architecture_guide/platforms.md) used by `runsc`. -## What is a Platform? +## Prerequisites -gVisor requires a *platform* to implement interception of syscalls, basic -context switching, and memory mapping functionality. These are described in more -depth in the [Platform Design](../../architecture_guide/platforms/). +If you intend to run the KVM platform, you will also to have KVM installed on +your system. If you are running a Debian based system like Debian or Ubuntu you +can usually do this by ensuring the module is loaded, and permissions are +appropriately set on the `/dev/kvm` device. -## Selecting a Platform - -The platform is selected by the `--platform` command line flag passed to -`runsc`. By default, the ptrace platform is selected. To select a different -platform, modify your Docker configuration (`/etc/docker/daemon.json`) to pass -this argument: - -```json -{ - "runtimes": { - "runsc": { - "path": "/usr/local/bin/runsc", - "runtimeArgs": [ - "--platform=kvm" - ] - } - } -} -``` - -You must restart the Docker daemon after making changes to this file, typically -this is done via `systemd`: +If you have an Intel CPU: ```bash -sudo systemctl restart docker +sudo modprobe kvm-intel && sudo chmod a+rw /dev/kvm ``` -## Example: Using the KVM Platform - -The KVM platform is currently experimental; however, it provides several -benefits over the default ptrace platform. - -### Prerequisites - -You will also to have KVM installed on your system. If you are running a Debian -based system like Debian or Ubuntu you can usually do this by installing the -`qemu-kvm` package. +If you have an AMD CPU: ```bash -sudo apt-get install qemu-kvm +sudo modprobe kvm-amd && sudo chmod a+rw /dev/kvm ``` If you are using a virtual machine you will need to make sure that nested @@ -68,31 +39,22 @@ cause of security issues (e.g. [CVE-2018-12904](https://nvd.nist.gov/vuln/detail/CVE-2018-12904)). It is not recommended for production.*** -### Configuring Docker - -Per above, you will need to configure Docker to use `runsc` with the KVM -platform. You will remember from the Docker Quick Start that you configured -Docker to use `runsc` as the runtime. Docker allows you to add multiple runtimes -to the Docker configuration. +## Configuring Docker -Add a new entry for the KVM platform entry to your Docker configuration -(`/etc/docker/daemon.json`) in order to provide the `--platform=kvm` runtime -argument. - -In the end, the file should look something like: +The platform is selected by the `--platform` command line flag passed to +`runsc`. By default, the ptrace platform is selected. For example, to select the +KVM platform, modify your Docker configuration (`/etc/docker/daemon.json`) to +pass the `--platform` argument: ```json { "runtimes": { "runsc": { - "path": "/usr/local/bin/runsc" - }, - "runsc-kvm": { "path": "/usr/local/bin/runsc", "runtimeArgs": [ "--platform=kvm" ] - } + } } } ``` @@ -104,13 +66,27 @@ this is done via `systemd`: sudo systemctl restart docker ``` -## Running a container +Note that you may configure multiple runtimes using different platforms. For +example, the following configuration has one configuration for ptrace and one +for the KVM platform: -Now run your container using the `runsc-kvm` runtime. This will run the -container using the KVM platform: - -```bash -docker run --runtime=runsc-kvm --rm hello-world +```json +{ + "runtimes": { + "runsc-ptrace": { + "path": "/usr/local/bin/runsc", + "runtimeArgs": [ + "--platform=ptrace" + ] + }, + "runsc-kvm": { + "path": "/usr/local/bin/runsc", + "runtimeArgs": [ + "--platform=kvm" + ] + } + } +} ``` [nested-azure]: https://docs.microsoft.com/en-us/azure/virtual-machines/windows/nested-virtualization diff --git a/g3doc/user_guide/quick_start/docker.md b/g3doc/user_guide/quick_start/docker.md index 5228db4c0..6ad594ecc 100644 --- a/g3doc/user_guide/quick_start/docker.md +++ b/g3doc/user_guide/quick_start/docker.md @@ -1,4 +1,4 @@ -# Docker +# Docker Quick Start > Note: This guide requires Docker version 17.09.0 or greater. Refer to the > [Docker documentation][docker] for how to install it. @@ -14,24 +14,28 @@ the next section and proceed straight to running a container. ## Configuring Docker First you will need to configure Docker to use `runsc` by adding a runtime entry -to your Docker configuration (`/etc/docker/daemon.json`). You may have to create -this file if it does not exist. Also, some Docker versions also require you to -[specify the `storage-driver` field][storage-driver]. - -In the end, the file should look something like: - -```json -{ - "runtimes": { - "runsc": { - "path": "/usr/local/bin/runsc" - } - } -} +to your Docker configuration (e.g. `/etc/docker/daemon.json`). The easiest way +to this is via the `runsc install` command. This will install a docker runtime +named "runsc" by default. + +```bash +sudo runsc install +``` + +You may also wish to install a runtime entry for debugging. The `runsc install` +command can accept options that will be passed to the runtime when it is invoked +by Docker. + +```bash +sudo runsc install --runtime runsc-debug -- \ + --debug \ + --debug-log=/tmp/runsc-debug.log \ + --strace \ + --log-packets ``` -You must restart the Docker daemon after making changes to this file, typically -this is done via `systemd`: +You must restart the Docker daemon after installing the runtime. Typically this +is done via `systemd`: ```bash sudo systemctl restart docker diff --git a/g3doc/user_guide/quick_start/kubernetes.md b/g3doc/user_guide/quick_start/kubernetes.md index b1f67252e..f875d8002 100644 --- a/g3doc/user_guide/quick_start/kubernetes.md +++ b/g3doc/user_guide/quick_start/kubernetes.md @@ -1,4 +1,4 @@ -# Kubernetes +# Kubernetes Quick Start gVisor can be used to run Kubernetes pods and has several integration points with Kubernetes. diff --git a/g3doc/user_guide/quick_start/oci.md b/g3doc/user_guide/quick_start/oci.md index 57bcc4f63..877169145 100644 --- a/g3doc/user_guide/quick_start/oci.md +++ b/g3doc/user_guide/quick_start/oci.md @@ -1,4 +1,4 @@ -# OCI +# OCI Quick Start This guide will quickly get you started running your first gVisor sandbox container using the runtime directly with the default platform. diff --git a/g3doc/user_guide/tutorials/docker.md b/g3doc/user_guide/tutorials/docker.md index c0a3db506..705560038 100644 --- a/g3doc/user_guide/tutorials/docker.md +++ b/g3doc/user_guide/tutorials/docker.md @@ -1,4 +1,4 @@ -# WorkPress with Docker +# WordPress with Docker This page shows you how to deploy a sample [WordPress][wordpress] site using [Docker][docker]. diff --git a/images/tmpfile/Dockerfile b/images/tmpfile/Dockerfile new file mode 100644 index 000000000..e3816c8cb --- /dev/null +++ b/images/tmpfile/Dockerfile @@ -0,0 +1,4 @@ +# Create file under /tmp to ensure files inside '/tmp' are not overridden. +FROM alpine:3.11.5 +RUN mkdir -p /tmp/foo \ + && echo 123 > /tmp/foo/file.txt diff --git a/pkg/goid/BUILD b/pkg/goid/BUILD index ea8d2422c..7a82631c5 100644 --- a/pkg/goid/BUILD +++ b/pkg/goid/BUILD @@ -7,6 +7,7 @@ go_library( srcs = [ "goid.go", "goid_amd64.s", + "goid_arm64.s", "goid_race.go", "goid_unsafe.go", ], diff --git a/pkg/sentry/fsimpl/gofer/pagemath.go b/pkg/goid/goid_arm64.s index 847cb0784..a7465b75d 100644 --- a/pkg/sentry/fsimpl/gofer/pagemath.go +++ b/pkg/goid/goid_arm64.s @@ -1,4 +1,4 @@ -// Copyright 2019 The gVisor Authors. +// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,20 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -package gofer +#include "textflag.h" -import ( - "gvisor.dev/gvisor/pkg/usermem" -) - -// This are equivalent to usermem.Addr.RoundDown/Up, but without the -// potentially truncating conversion to usermem.Addr. This is necessary because -// there is no way to define generic "PageRoundDown/Up" functions in Go. - -func pageRoundDown(x uint64) uint64 { - return x &^ (usermem.PageSize - 1) -} - -func pageRoundUp(x uint64) uint64 { - return pageRoundDown(x + usermem.PageSize - 1) -} +// func getg() *g +TEXT ·getg(SB),NOSPLIT,$0-8 + MOVD g, R0 // g + MOVD R0, ret+0(FP) + RET diff --git a/pkg/linewriter/BUILD b/pkg/linewriter/BUILD index 41bf104d0..f84d03700 100644 --- a/pkg/linewriter/BUILD +++ b/pkg/linewriter/BUILD @@ -5,6 +5,8 @@ package(licenses = ["notice"]) go_library( name = "linewriter", srcs = ["linewriter.go"], + marshal = False, + stateify = False, visibility = ["//visibility:public"], deps = ["//pkg/sync"], ) diff --git a/pkg/log/BUILD b/pkg/log/BUILD index a7c8f7bef..3ed6aba5c 100644 --- a/pkg/log/BUILD +++ b/pkg/log/BUILD @@ -10,6 +10,8 @@ go_library( "json_k8s.go", "log.go", ], + marshal = False, + stateify = False, visibility = [ "//visibility:public", ], diff --git a/pkg/segment/BUILD b/pkg/segment/BUILD index 1b487b887..f57ccc170 100644 --- a/pkg/segment/BUILD +++ b/pkg/segment/BUILD @@ -21,6 +21,8 @@ go_template( ], opt_consts = [ "minDegree", + # trackGaps must either be 0 or 1. + "trackGaps", ], types = [ "Key", diff --git a/pkg/segment/set.go b/pkg/segment/set.go index 03e4f258f..1a17ad9cb 100644 --- a/pkg/segment/set.go +++ b/pkg/segment/set.go @@ -36,6 +36,34 @@ type Range interface{} // Value is a required type parameter. type Value interface{} +// trackGaps is an optional parameter. +// +// If trackGaps is 1, the Set will track maximum gap size recursively, +// enabling the GapIterator.{Prev,Next}LargeEnoughGap functions. In this +// case, Key must be an unsigned integer. +// +// trackGaps must be 0 or 1. +const trackGaps = 0 + +var _ = uint8(trackGaps << 7) // Will fail if not zero or one. + +// dynamicGap is a type that disappears if trackGaps is 0. +type dynamicGap [trackGaps]Key + +// Get returns the value of the gap. +// +// Precondition: trackGaps must be non-zero. +func (d *dynamicGap) Get() Key { + return d[:][0] +} + +// Set sets the value of the gap. +// +// Precondition: trackGaps must be non-zero. +func (d *dynamicGap) Set(v Key) { + d[:][0] = v +} + // Functions is a required type parameter that must be a struct implementing // the methods defined by Functions. type Functions interface { @@ -327,8 +355,12 @@ func (s *Set) Insert(gap GapIterator, r Range, val Value) Iterator { } if prev.Ok() && prev.End() == r.Start { if mval, ok := (Functions{}).Merge(prev.Range(), prev.Value(), r, val); ok { + shrinkMaxGap := trackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() prev.SetEndUnchecked(r.End) prev.SetValue(mval) + if shrinkMaxGap { + gap.node.updateMaxGapLeaf() + } if next.Ok() && next.Start() == r.End { val = mval if mval, ok := (Functions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok { @@ -342,11 +374,16 @@ func (s *Set) Insert(gap GapIterator, r Range, val Value) Iterator { } if next.Ok() && next.Start() == r.End { if mval, ok := (Functions{}).Merge(r, val, next.Range(), next.Value()); ok { + shrinkMaxGap := trackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() next.SetStartUnchecked(r.Start) next.SetValue(mval) + if shrinkMaxGap { + gap.node.updateMaxGapLeaf() + } return next } } + // InsertWithoutMergingUnchecked will maintain maxGap if necessary. return s.InsertWithoutMergingUnchecked(gap, r, val) } @@ -373,11 +410,15 @@ func (s *Set) InsertWithoutMerging(gap GapIterator, r Range, val Value) Iterator // Preconditions: r.Start >= gap.Start(); r.End <= gap.End(). func (s *Set) InsertWithoutMergingUnchecked(gap GapIterator, r Range, val Value) Iterator { gap = gap.node.rebalanceBeforeInsert(gap) + splitMaxGap := trackGaps != 0 && (gap.node.nrSegments == 0 || gap.Range().Length() == gap.node.maxGap.Get()) copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments]) copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments]) gap.node.keys[gap.index] = r gap.node.values[gap.index] = val gap.node.nrSegments++ + if splitMaxGap { + gap.node.updateMaxGapLeaf() + } return Iterator{gap.node, gap.index} } @@ -399,12 +440,23 @@ func (s *Set) Remove(seg Iterator) GapIterator { // overlap. seg.SetRangeUnchecked(victim.Range()) seg.SetValue(victim.Value()) + // Need to update the nextAdjacentNode's maxGap because the gap in between + // must have been modified by updating seg.Range() to victim.Range(). + // seg.NextSegment() must exist since the last segment can't be in a + // non-leaf node. + nextAdjacentNode := seg.NextSegment().node + if trackGaps != 0 { + nextAdjacentNode.updateMaxGapLeaf() + } return s.Remove(victim).NextGap() } copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments]) copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments]) Functions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1]) seg.node.nrSegments-- + if trackGaps != 0 { + seg.node.updateMaxGapLeaf() + } return seg.node.rebalanceAfterRemove(GapIterator{seg.node, seg.index}) } @@ -455,6 +507,7 @@ func (s *Set) MergeUnchecked(first, second Iterator) Iterator { // overlaps second. first.SetEndUnchecked(second.End()) first.SetValue(mval) + // Remove will handle the maxGap update if necessary. return s.Remove(second).PrevSegment() } } @@ -631,6 +684,12 @@ type node struct { // than "isLeaf" because false must be the correct value for an empty root. hasChildren bool + // The longest gap within this node. If the node is a leaf, it's simply the + // maximum gap among all the (nrSegments+1) gaps formed by its nrSegments keys + // including the 0th and nrSegments-th gap possibly shared with its upper-level + // nodes; if it's a non-leaf node, it's the max of all children's maxGap. + maxGap dynamicGap + // Nodes store keys and values in separate arrays to maximize locality in // the common case (scanning keys for lookup). keys [maxDegree - 1]Range @@ -676,12 +735,12 @@ func (n *node) nextSibling() *node { // required for insertion, and returns an updated iterator to the position // represented by gap. func (n *node) rebalanceBeforeInsert(gap GapIterator) GapIterator { - if n.parent != nil { - gap = n.parent.rebalanceBeforeInsert(gap) - } if n.nrSegments < maxDegree-1 { return gap } + if n.parent != nil { + gap = n.parent.rebalanceBeforeInsert(gap) + } if n.parent == nil { // n is root. Move all segments before and after n's median segment // into new child nodes adjacent to the median segment, which is now @@ -719,6 +778,13 @@ func (n *node) rebalanceBeforeInsert(gap GapIterator) GapIterator { n.hasChildren = true n.children[0] = left n.children[1] = right + // In this case, n's maxGap won't violated as it's still the root, + // but the left and right children should be updated locally as they + // are newly split from n. + if trackGaps != 0 { + left.updateMaxGapLocal() + right.updateMaxGapLocal() + } if gap.node != n { return gap } @@ -758,6 +824,12 @@ func (n *node) rebalanceBeforeInsert(gap GapIterator) GapIterator { } } n.nrSegments = minDegree - 1 + // MaxGap of n's parent is not violated because the segments within is not changed. + // n and its sibling's maxGap need to be updated locally as they are two new nodes split from old n. + if trackGaps != 0 { + n.updateMaxGapLocal() + sibling.updateMaxGapLocal() + } // gap.node can't be n.parent because gaps are always in leaf nodes. if gap.node != n { return gap @@ -821,6 +893,12 @@ func (n *node) rebalanceAfterRemove(gap GapIterator) GapIterator { } n.nrSegments++ sibling.nrSegments-- + // n's parent's maxGap does not need to be updated as its content is unmodified. + // n and its sibling must be updated with (new) maxGap because of the shift of keys. + if trackGaps != 0 { + n.updateMaxGapLocal() + sibling.updateMaxGapLocal() + } if gap.node == sibling && gap.index == sibling.nrSegments { return GapIterator{n, 0} } @@ -849,6 +927,12 @@ func (n *node) rebalanceAfterRemove(gap GapIterator) GapIterator { } n.nrSegments++ sibling.nrSegments-- + // n's parent's maxGap does not need to be updated as its content is unmodified. + // n and its sibling must be updated with (new) maxGap because of the shift of keys. + if trackGaps != 0 { + n.updateMaxGapLocal() + sibling.updateMaxGapLocal() + } if gap.node == sibling { if gap.index == 0 { return GapIterator{n, n.nrSegments} @@ -886,6 +970,7 @@ func (n *node) rebalanceAfterRemove(gap GapIterator) GapIterator { p.children[0] = nil p.children[1] = nil } + // No need to update maxGap of p as its content is not changed. if gap.node == left { return GapIterator{p, gap.index} } @@ -932,11 +1017,152 @@ func (n *node) rebalanceAfterRemove(gap GapIterator) GapIterator { } p.children[p.nrSegments] = nil p.nrSegments-- + // Update maxGap of left locally, no need to change p and right because + // p's contents is not changed and right is already invalid. + if trackGaps != 0 { + left.updateMaxGapLocal() + } // This process robs p of one segment, so recurse into rebalancing p. n = p } } +// updateMaxGapLeaf updates maxGap bottom-up from the calling leaf until no +// necessary update. +// +// Preconditions: n must be a leaf node, trackGaps must be 1. +func (n *node) updateMaxGapLeaf() { + if n.hasChildren { + panic(fmt.Sprintf("updateMaxGapLeaf should always be called on leaf node: %v", n)) + } + max := n.calculateMaxGapLeaf() + if max == n.maxGap.Get() { + // If new max equals the old maxGap, no update is needed. + return + } + oldMax := n.maxGap.Get() + n.maxGap.Set(max) + if max > oldMax { + // Grow ancestor maxGaps. + for p := n.parent; p != nil; p = p.parent { + if p.maxGap.Get() >= max { + // p and its ancestors already contain an equal or larger gap. + break + } + // Only if new maxGap is larger than parent's + // old maxGap, propagate this update to parent. + p.maxGap.Set(max) + } + return + } + // Shrink ancestor maxGaps. + for p := n.parent; p != nil; p = p.parent { + if p.maxGap.Get() > oldMax { + // p and its ancestors still contain a larger gap. + break + } + // If new max is smaller than the old maxGap, and this gap used + // to be the maxGap of its parent, iterate parent's children + // and calculate parent's new maxGap.(It's probable that parent + // has two children with the old maxGap, but we need to check it anyway.) + parentNewMax := p.calculateMaxGapInternal() + if p.maxGap.Get() == parentNewMax { + // p and its ancestors still contain a gap of at least equal size. + break + } + // If p's new maxGap differs from the old one, propagate this update. + p.maxGap.Set(parentNewMax) + } +} + +// updateMaxGapLocal updates maxGap of the calling node solely with no +// propagation to ancestor nodes. +// +// Precondition: trackGaps must be 1. +func (n *node) updateMaxGapLocal() { + if !n.hasChildren { + // Leaf node iterates its gaps. + n.maxGap.Set(n.calculateMaxGapLeaf()) + } else { + // Non-leaf node iterates its children. + n.maxGap.Set(n.calculateMaxGapInternal()) + } +} + +// calculateMaxGapLeaf iterates the gaps within a leaf node and calculate the +// max. +// +// Preconditions: n must be a leaf node. +func (n *node) calculateMaxGapLeaf() Key { + max := GapIterator{n, 0}.Range().Length() + for i := 1; i <= n.nrSegments; i++ { + if current := (GapIterator{n, i}).Range().Length(); current > max { + max = current + } + } + return max +} + +// calculateMaxGapInternal iterates children's maxGap within an internal node n +// and calculate the max. +// +// Preconditions: n must be a non-leaf node. +func (n *node) calculateMaxGapInternal() Key { + max := n.children[0].maxGap.Get() + for i := 1; i <= n.nrSegments; i++ { + if current := n.children[i].maxGap.Get(); current > max { + max = current + } + } + return max +} + +// searchFirstLargeEnoughGap returns the first gap having at least minSize length +// in the subtree rooted by n. If not found, return a terminal gap iterator. +func (n *node) searchFirstLargeEnoughGap(minSize Key) GapIterator { + if n.maxGap.Get() < minSize { + return GapIterator{} + } + if n.hasChildren { + for i := 0; i <= n.nrSegments; i++ { + if largeEnoughGap := n.children[i].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { + return largeEnoughGap + } + } + } else { + for i := 0; i <= n.nrSegments; i++ { + currentGap := GapIterator{n, i} + if currentGap.Range().Length() >= minSize { + return currentGap + } + } + } + panic(fmt.Sprintf("invalid maxGap in %v", n)) +} + +// searchLastLargeEnoughGap returns the last gap having at least minSize length +// in the subtree rooted by n. If not found, return a terminal gap iterator. +func (n *node) searchLastLargeEnoughGap(minSize Key) GapIterator { + if n.maxGap.Get() < minSize { + return GapIterator{} + } + if n.hasChildren { + for i := n.nrSegments; i >= 0; i-- { + if largeEnoughGap := n.children[i].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { + return largeEnoughGap + } + } + } else { + for i := n.nrSegments; i >= 0; i-- { + currentGap := GapIterator{n, i} + if currentGap.Range().Length() >= minSize { + return currentGap + } + } + } + panic(fmt.Sprintf("invalid maxGap in %v", n)) +} + // A Iterator is conceptually one of: // // - A pointer to a segment in a set; or @@ -1243,6 +1469,122 @@ func (gap GapIterator) NextGap() GapIterator { return seg.NextGap() } +// NextLargeEnoughGap returns the iterated gap's first next gap with larger +// length than minSize. If not found, return a terminal gap iterator (does NOT +// include this gap itself). +// +// Precondition: trackGaps must be 1. +func (gap GapIterator) NextLargeEnoughGap(minSize Key) GapIterator { + if trackGaps != 1 { + panic("set is not tracking gaps") + } + if gap.node != nil && gap.node.hasChildren && gap.index == gap.node.nrSegments { + // If gap is the trailing gap of an non-leaf node, + // translate it to the equivalent gap on leaf level. + gap.node = gap.NextSegment().node + gap.index = 0 + return gap.nextLargeEnoughGapHelper(minSize) + } + return gap.nextLargeEnoughGapHelper(minSize) +} + +// nextLargeEnoughGapHelper is the helper function used by NextLargeEnoughGap +// to do the real recursions. +// +// Preconditions: gap is NOT the trailing gap of a non-leaf node. +func (gap GapIterator) nextLargeEnoughGapHelper(minSize Key) GapIterator { + // Crawl up the tree if no large enough gap in current node or the + // current gap is the trailing one on leaf level. + for gap.node != nil && + (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == gap.node.nrSegments)) { + gap.node, gap.index = gap.node.parent, gap.node.parentIndex + } + // If no large enough gap throughout the whole set, return a terminal + // gap iterator. + if gap.node == nil { + return GapIterator{} + } + // Iterate subsequent gaps. + gap.index++ + for gap.index <= gap.node.nrSegments { + if gap.node.hasChildren { + if largeEnoughGap := gap.node.children[gap.index].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { + return largeEnoughGap + } + } else { + if gap.Range().Length() >= minSize { + return gap + } + } + gap.index++ + } + gap.node, gap.index = gap.node.parent, gap.node.parentIndex + if gap.node != nil && gap.index == gap.node.nrSegments { + // If gap is the trailing gap of a non-leaf node, crawl up to + // parent again and do recursion. + gap.node, gap.index = gap.node.parent, gap.node.parentIndex + } + return gap.nextLargeEnoughGapHelper(minSize) +} + +// PrevLargeEnoughGap returns the iterated gap's first prev gap with larger or +// equal length than minSize. If not found, return a terminal gap iterator +// (does NOT include this gap itself). +// +// Precondition: trackGaps must be 1. +func (gap GapIterator) PrevLargeEnoughGap(minSize Key) GapIterator { + if trackGaps != 1 { + panic("set is not tracking gaps") + } + if gap.node != nil && gap.node.hasChildren && gap.index == 0 { + // If gap is the first gap of an non-leaf node, + // translate it to the equivalent gap on leaf level. + gap.node = gap.PrevSegment().node + gap.index = gap.node.nrSegments + return gap.prevLargeEnoughGapHelper(minSize) + } + return gap.prevLargeEnoughGapHelper(minSize) +} + +// prevLargeEnoughGapHelper is the helper function used by PrevLargeEnoughGap +// to do the real recursions. +// +// Preconditions: gap is NOT the first gap of a non-leaf node. +func (gap GapIterator) prevLargeEnoughGapHelper(minSize Key) GapIterator { + // Crawl up the tree if no large enough gap in current node or the + // current gap is the first one on leaf level. + for gap.node != nil && + (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == 0)) { + gap.node, gap.index = gap.node.parent, gap.node.parentIndex + } + // If no large enough gap throughout the whole set, return a terminal + // gap iterator. + if gap.node == nil { + return GapIterator{} + } + // Iterate previous gaps. + gap.index-- + for gap.index >= 0 { + if gap.node.hasChildren { + if largeEnoughGap := gap.node.children[gap.index].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { + return largeEnoughGap + } + } else { + if gap.Range().Length() >= minSize { + return gap + } + } + gap.index-- + } + gap.node, gap.index = gap.node.parent, gap.node.parentIndex + if gap.node != nil && gap.index == 0 { + // If gap is the first gap of a non-leaf node, crawl up to + // parent again and do recursion. + gap.node, gap.index = gap.node.parent, gap.node.parentIndex + } + return gap.prevLargeEnoughGapHelper(minSize) +} + // segmentBeforePosition returns the predecessor segment of the position given // by n.children[i], which may or may not contain a child. If no such segment // exists, segmentBeforePosition returns a terminal iterator. @@ -1271,7 +1613,7 @@ func segmentAfterPosition(n *node, i int) Iterator { func zeroValueSlice(slice []Value) { // TODO(jamieliu): check if Go is actually smart enough to optimize a - // ClearValue that assigns nil to a memset here + // ClearValue that assigns nil to a memset here. for i := range slice { Functions{}.ClearValue(&slice[i]) } @@ -1310,7 +1652,15 @@ func (n *node) writeDebugString(buf *bytes.Buffer, prefix string) { child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i)) } buf.WriteString(prefix) - buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) + if n.hasChildren { + if trackGaps != 0 { + buf.WriteString(fmt.Sprintf("- % 3d: %v => %v, maxGap: %d\n", i, n.keys[i], n.values[i], n.maxGap.Get())) + } else { + buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) + } + } else { + buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) + } } if child := n.children[n.nrSegments]; child != nil { child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments)) @@ -1362,3 +1712,43 @@ func (s *Set) ImportSortedSlices(sds *SegmentDataSlices) error { } return nil } + +// segmentTestCheck returns an error if s is incorrectly sorted, does not +// contain exactly expectedSegments segments, or contains a segment which +// fails the passed check. +// +// This should be used only for testing, and has been added to this package for +// templating convenience. +func (s *Set) segmentTestCheck(expectedSegments int, segFunc func(int, Range, Value) error) error { + havePrev := false + prev := Key(0) + nrSegments := 0 + for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + next := seg.Start() + if havePrev && prev >= next { + return fmt.Errorf("incorrect order: key %d (segment %d) >= key %d (segment %d)", prev, nrSegments-1, next, nrSegments) + } + if segFunc != nil { + if err := segFunc(nrSegments, seg.Range(), seg.Value()); err != nil { + return err + } + } + prev = next + havePrev = true + nrSegments++ + } + if nrSegments != expectedSegments { + return fmt.Errorf("incorrect number of segments: got %d, wanted %d", nrSegments, expectedSegments) + } + return nil +} + +// countSegments counts the number of segments in the set. +// +// Similar to Check, this should only be used for testing. +func (s *Set) countSegments() (segments int) { + for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + segments++ + } + return segments +} diff --git a/pkg/segment/test/BUILD b/pkg/segment/test/BUILD index f2d8462d8..131bf09b9 100644 --- a/pkg/segment/test/BUILD +++ b/pkg/segment/test/BUILD @@ -29,10 +29,28 @@ go_template_instance( }, ) +go_template_instance( + name = "gap_set", + out = "gap_set.go", + consts = { + "trackGaps": "1", + }, + package = "segment", + prefix = "gap", + template = "//pkg/segment:generic_set", + types = { + "Key": "int", + "Range": "Range", + "Value": "int", + "Functions": "gapSetFunctions", + }, +) + go_library( name = "segment", testonly = 1, srcs = [ + "gap_set.go", "int_range.go", "int_set.go", "set_functions.go", diff --git a/pkg/segment/test/segment_test.go b/pkg/segment/test/segment_test.go index 97b16c158..85fa19096 100644 --- a/pkg/segment/test/segment_test.go +++ b/pkg/segment/test/segment_test.go @@ -17,6 +17,7 @@ package segment import ( "fmt" "math/rand" + "reflect" "testing" ) @@ -32,61 +33,65 @@ const ( // valueOffset is the difference between the value and start of test // segments. valueOffset = 100000 + + // intervalLength is the interval used by random gap tests. + intervalLength = 10 ) func shuffle(xs []int) { - for i := range xs { - j := rand.Intn(i + 1) - xs[i], xs[j] = xs[j], xs[i] - } + rand.Shuffle(len(xs), func(i, j int) { xs[i], xs[j] = xs[j], xs[i] }) } -func randPermutation(size int) []int { +func randIntervalPermutation(size int) []int { p := make([]int, size) for i := range p { - p[i] = i + p[i] = intervalLength * i } shuffle(p) return p } -// checkSet returns an error if s is incorrectly sorted, does not contain -// exactly expectedSegments segments, or contains a segment for which val != -// key + valueOffset. -func checkSet(s *Set, expectedSegments int) error { - havePrev := false - prev := 0 - nrSegments := 0 - for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { - next := seg.Start() - if havePrev && prev >= next { - return fmt.Errorf("incorrect order: key %d (segment %d) >= key %d (segment %d)", prev, nrSegments-1, next, nrSegments) - } - if got, want := seg.Value(), seg.Start()+valueOffset; got != want { - return fmt.Errorf("segment %d has key %d, value %d (expected %d)", nrSegments, seg.Start(), got, want) - } - prev = next - havePrev = true - nrSegments++ - } - if nrSegments != expectedSegments { - return fmt.Errorf("incorrect number of segments: got %d, wanted %d", nrSegments, expectedSegments) +// validate can be passed to Check. +func validate(nr int, r Range, v int) error { + if got, want := v, r.Start+valueOffset; got != want { + return fmt.Errorf("segment %d has key %d, value %d (expected %d)", nr, r.Start, got, want) } return nil } -// countSegmentsIn returns the number of segments in s. -func countSegmentsIn(s *Set) int { - var count int - for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { - count++ +// checkSetMaxGap returns an error if maxGap inside all nodes of s is not well +// maintained. +func checkSetMaxGap(s *gapSet) error { + n := s.root + return checkNodeMaxGap(&n) +} + +// checkNodeMaxGap returns an error if maxGap inside the subtree rooted by n is +// not well maintained. +func checkNodeMaxGap(n *gapnode) error { + var max int + if !n.hasChildren { + max = n.calculateMaxGapLeaf() + } else { + for i := 0; i <= n.nrSegments; i++ { + child := n.children[i] + if err := checkNodeMaxGap(child); err != nil { + return err + } + if temp := child.maxGap.Get(); i == 0 || temp > max { + max = temp + } + } + } + if max != n.maxGap.Get() { + return fmt.Errorf("maxGap wrong in node\n%vexpected: %d got: %d", n, max, n.maxGap) } - return count + return nil } func TestAddRandom(t *testing.T) { var s Set - order := randPermutation(testSize) + order := rand.Perm(testSize) var nrInsertions int for i, j := range order { if !s.AddWithoutMerging(Range{j, j + 1}, j+valueOffset) { @@ -94,12 +99,12 @@ func TestAddRandom(t *testing.T) { break } nrInsertions++ - if err := checkSet(&s, nrInsertions); err != nil { + if err := s.segmentTestCheck(nrInsertions, validate); err != nil { t.Errorf("Iteration %d: %v", i, err) break } } - if got, want := countSegmentsIn(&s), nrInsertions; got != want { + if got, want := s.countSegments(), nrInsertions; got != want { t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want) } if t.Failed() { @@ -115,7 +120,156 @@ func TestRemoveRandom(t *testing.T) { t.Fatalf("Failed to insert segment %d", i) } } - order := randPermutation(testSize) + order := rand.Perm(testSize) + var nrRemovals int + for i, j := range order { + seg := s.FindSegment(j) + if !seg.Ok() { + t.Errorf("Iteration %d: failed to find segment with key %d", i, j) + break + } + s.Remove(seg) + nrRemovals++ + if err := s.segmentTestCheck(testSize-nrRemovals, validate); err != nil { + t.Errorf("Iteration %d: %v", i, err) + break + } + } + if got, want := s.countSegments(), testSize-nrRemovals; got != want { + t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want) + } + if t.Failed() { + t.Logf("Removal order: %v", order[:nrRemovals]) + t.Logf("Set contents:\n%v", &s) + t.FailNow() + } +} + +func TestMaxGapAddRandom(t *testing.T) { + var s gapSet + order := rand.Perm(testSize) + var nrInsertions int + for i, j := range order { + if !s.AddWithoutMerging(Range{j, j + 1}, j+valueOffset) { + t.Errorf("Iteration %d: failed to insert segment with key %d", i, j) + break + } + nrInsertions++ + if err := s.segmentTestCheck(nrInsertions, validate); err != nil { + t.Errorf("Iteration %d: %v", i, err) + break + } + if err := checkSetMaxGap(&s); err != nil { + t.Errorf("When inserting %d: %v", j, err) + break + } + } + if got, want := s.countSegments(), nrInsertions; got != want { + t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want) + } + if t.Failed() { + t.Logf("Insertion order: %v", order[:nrInsertions]) + t.Logf("Set contents:\n%v", &s) + } +} + +func TestMaxGapAddRandomWithRandomInterval(t *testing.T) { + var s gapSet + order := randIntervalPermutation(testSize) + var nrInsertions int + for i, j := range order { + if !s.AddWithoutMerging(Range{j, j + rand.Intn(intervalLength-1) + 1}, j+valueOffset) { + t.Errorf("Iteration %d: failed to insert segment with key %d", i, j) + break + } + nrInsertions++ + if err := s.segmentTestCheck(nrInsertions, validate); err != nil { + t.Errorf("Iteration %d: %v", i, err) + break + } + if err := checkSetMaxGap(&s); err != nil { + t.Errorf("When inserting %d: %v", j, err) + break + } + } + if got, want := s.countSegments(), nrInsertions; got != want { + t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want) + } + if t.Failed() { + t.Logf("Insertion order: %v", order[:nrInsertions]) + t.Logf("Set contents:\n%v", &s) + } +} + +func TestMaxGapAddRandomWithMerge(t *testing.T) { + var s gapSet + order := randIntervalPermutation(testSize) + nrInsertions := 1 + for i, j := range order { + if !s.Add(Range{j, j + intervalLength}, j+valueOffset) { + t.Errorf("Iteration %d: failed to insert segment with key %d", i, j) + break + } + if err := checkSetMaxGap(&s); err != nil { + t.Errorf("When inserting %d: %v", j, err) + break + } + } + if got, want := s.countSegments(), nrInsertions; got != want { + t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want) + } + if t.Failed() { + t.Logf("Insertion order: %v", order) + t.Logf("Set contents:\n%v", &s) + } +} + +func TestMaxGapRemoveRandom(t *testing.T) { + var s gapSet + for i := 0; i < testSize; i++ { + if !s.AddWithoutMerging(Range{i, i + 1}, i+valueOffset) { + t.Fatalf("Failed to insert segment %d", i) + } + } + order := rand.Perm(testSize) + var nrRemovals int + for i, j := range order { + seg := s.FindSegment(j) + if !seg.Ok() { + t.Errorf("Iteration %d: failed to find segment with key %d", i, j) + break + } + temprange := seg.Range() + s.Remove(seg) + nrRemovals++ + if err := s.segmentTestCheck(testSize-nrRemovals, validate); err != nil { + t.Errorf("Iteration %d: %v", i, err) + break + } + if err := checkSetMaxGap(&s); err != nil { + t.Errorf("When removing %v: %v", temprange, err) + break + } + } + if got, want := s.countSegments(), testSize-nrRemovals; got != want { + t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want) + } + if t.Failed() { + t.Logf("Removal order: %v", order[:nrRemovals]) + t.Logf("Set contents:\n%v", &s) + t.FailNow() + } +} + +func TestMaxGapRemoveHalfRandom(t *testing.T) { + var s gapSet + for i := 0; i < testSize; i++ { + if !s.AddWithoutMerging(Range{intervalLength * i, intervalLength*i + rand.Intn(intervalLength-1) + 1}, intervalLength*i+valueOffset) { + t.Fatalf("Failed to insert segment %d", i) + } + } + order := randIntervalPermutation(testSize) + order = order[:testSize/2] var nrRemovals int for i, j := range order { seg := s.FindSegment(j) @@ -123,14 +277,19 @@ func TestRemoveRandom(t *testing.T) { t.Errorf("Iteration %d: failed to find segment with key %d", i, j) break } + temprange := seg.Range() s.Remove(seg) nrRemovals++ - if err := checkSet(&s, testSize-nrRemovals); err != nil { + if err := s.segmentTestCheck(testSize-nrRemovals, validate); err != nil { t.Errorf("Iteration %d: %v", i, err) break } + if err := checkSetMaxGap(&s); err != nil { + t.Errorf("When removing %v: %v", temprange, err) + break + } } - if got, want := countSegmentsIn(&s), testSize-nrRemovals; got != want { + if got, want := s.countSegments(), testSize-nrRemovals; got != want { t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want) } if t.Failed() { @@ -140,6 +299,148 @@ func TestRemoveRandom(t *testing.T) { } } +func TestMaxGapAddRandomRemoveRandomHalfWithMerge(t *testing.T) { + var s gapSet + order := randIntervalPermutation(testSize * 2) + order = order[:testSize] + for i, j := range order { + if !s.Add(Range{j, j + intervalLength}, j+valueOffset) { + t.Errorf("Iteration %d: failed to insert segment with key %d", i, j) + break + } + if err := checkSetMaxGap(&s); err != nil { + t.Errorf("When inserting %d: %v", j, err) + break + } + } + shuffle(order) + var nrRemovals int + for _, j := range order { + seg := s.FindSegment(j) + if !seg.Ok() { + continue + } + temprange := seg.Range() + s.Remove(seg) + nrRemovals++ + if err := checkSetMaxGap(&s); err != nil { + t.Errorf("When removing %v: %v", temprange, err) + break + } + } + if t.Failed() { + t.Logf("Removal order: %v", order[:nrRemovals]) + t.Logf("Set contents:\n%v", &s) + t.FailNow() + } +} + +func TestNextLargeEnoughGap(t *testing.T) { + var s gapSet + order := randIntervalPermutation(testSize * 2) + order = order[:testSize] + for i, j := range order { + if !s.Add(Range{j, j + rand.Intn(intervalLength-1) + 1}, j+valueOffset) { + t.Errorf("Iteration %d: failed to insert segment with key %d", i, j) + break + } + if err := checkSetMaxGap(&s); err != nil { + t.Errorf("When inserting %d: %v", j, err) + break + } + } + shuffle(order) + order = order[:testSize/2] + for _, j := range order { + seg := s.FindSegment(j) + if !seg.Ok() { + continue + } + temprange := seg.Range() + s.Remove(seg) + if err := checkSetMaxGap(&s); err != nil { + t.Errorf("When removing %v: %v", temprange, err) + break + } + } + minSize := 7 + var gapArr1 []int + for gap := s.LowerBoundGap(0).NextLargeEnoughGap(minSize); gap.Ok(); gap = gap.NextLargeEnoughGap(minSize) { + if gap.Range().Length() < minSize { + t.Errorf("NextLargeEnoughGap wrong, gap %v has length %d, wanted %d", gap.Range(), gap.Range().Length(), minSize) + } else { + gapArr1 = append(gapArr1, gap.Range().Start) + } + } + var gapArr2 []int + for gap := s.LowerBoundGap(0).NextGap(); gap.Ok(); gap = gap.NextGap() { + if gap.Range().Length() >= minSize { + gapArr2 = append(gapArr2, gap.Range().Start) + } + } + + if !reflect.DeepEqual(gapArr2, gapArr1) { + t.Errorf("Search result not correct, got: %v, wanted: %v", gapArr1, gapArr2) + } + if t.Failed() { + t.Logf("Set contents:\n%v", &s) + t.FailNow() + } +} + +func TestPrevLargeEnoughGap(t *testing.T) { + var s gapSet + order := randIntervalPermutation(testSize * 2) + order = order[:testSize] + for i, j := range order { + if !s.Add(Range{j, j + rand.Intn(intervalLength-1) + 1}, j+valueOffset) { + t.Errorf("Iteration %d: failed to insert segment with key %d", i, j) + break + } + if err := checkSetMaxGap(&s); err != nil { + t.Errorf("When inserting %d: %v", j, err) + break + } + } + end := s.LastSegment().End() + shuffle(order) + order = order[:testSize/2] + for _, j := range order { + seg := s.FindSegment(j) + if !seg.Ok() { + continue + } + temprange := seg.Range() + s.Remove(seg) + if err := checkSetMaxGap(&s); err != nil { + t.Errorf("When removing %v: %v", temprange, err) + break + } + } + minSize := 7 + var gapArr1 []int + for gap := s.UpperBoundGap(end + intervalLength).PrevLargeEnoughGap(minSize); gap.Ok(); gap = gap.PrevLargeEnoughGap(minSize) { + if gap.Range().Length() < minSize { + t.Errorf("PrevLargeEnoughGap wrong, gap length %d, wanted %d", gap.Range().Length(), minSize) + } else { + gapArr1 = append(gapArr1, gap.Range().Start) + } + } + var gapArr2 []int + for gap := s.UpperBoundGap(end + intervalLength).PrevGap(); gap.Ok(); gap = gap.PrevGap() { + if gap.Range().Length() >= minSize { + gapArr2 = append(gapArr2, gap.Range().Start) + } + } + if !reflect.DeepEqual(gapArr2, gapArr1) { + t.Errorf("Search result not correct, got: %v, wanted: %v", gapArr1, gapArr2) + } + if t.Failed() { + t.Logf("Set contents:\n%v", &s) + t.FailNow() + } +} + func TestAddSequentialAdjacent(t *testing.T) { var s Set var nrInsertions int @@ -148,12 +449,12 @@ func TestAddSequentialAdjacent(t *testing.T) { t.Fatalf("Failed to insert segment %d", i) } nrInsertions++ - if err := checkSet(&s, nrInsertions); err != nil { + if err := s.segmentTestCheck(nrInsertions, validate); err != nil { t.Errorf("Iteration %d: %v", i, err) break } } - if got, want := countSegmentsIn(&s), nrInsertions; got != want { + if got, want := s.countSegments(), nrInsertions; got != want { t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want) } if t.Failed() { @@ -202,12 +503,12 @@ func TestAddSequentialNonAdjacent(t *testing.T) { t.Fatalf("Failed to insert segment %d", i) } nrInsertions++ - if err := checkSet(&s, nrInsertions); err != nil { + if err := s.segmentTestCheck(nrInsertions, validate); err != nil { t.Errorf("Iteration %d: %v", i, err) break } } - if got, want := countSegmentsIn(&s), nrInsertions; got != want { + if got, want := s.countSegments(), nrInsertions; got != want { t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want) } if t.Failed() { @@ -293,7 +594,7 @@ Tests: var i int for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { if i > len(test.final) { - t.Errorf("%s: Incorrect number of segments: got %d, wanted %d; set contents:\n%v", test.name, countSegmentsIn(&s), len(test.final), &s) + t.Errorf("%s: Incorrect number of segments: got %d, wanted %d; set contents:\n%v", test.name, s.countSegments(), len(test.final), &s) continue Tests } if got, want := seg.Range(), test.final[i]; got != want { @@ -351,7 +652,7 @@ Tests: var i int for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { if i > len(test.final) { - t.Errorf("%s: Incorrect number of segments: got %d, wanted %d; set contents:\n%v", test.name, countSegmentsIn(&s), len(test.final), &s) + t.Errorf("%s: Incorrect number of segments: got %d, wanted %d; set contents:\n%v", test.name, s.countSegments(), len(test.final), &s) continue Tests } if got, want := seg.Range(), test.final[i]; got != want { @@ -378,7 +679,7 @@ func benchmarkAddSequential(b *testing.B, size int) { } func benchmarkAddRandom(b *testing.B, size int) { - order := randPermutation(size) + order := rand.Perm(size) b.ResetTimer() for n := 0; n < b.N; n++ { @@ -416,7 +717,7 @@ func benchmarkFindRandom(b *testing.B, size int) { b.Fatalf("Failed to insert segment %d", i) } } - order := randPermutation(size) + order := rand.Perm(size) b.ResetTimer() for n := 0; n < b.N; n++ { @@ -470,7 +771,7 @@ func benchmarkAddFindRemoveSequential(b *testing.B, size int) { } func benchmarkAddFindRemoveRandom(b *testing.B, size int) { - order := randPermutation(size) + order := rand.Perm(size) b.ResetTimer() for n := 0; n < b.N; n++ { diff --git a/pkg/segment/test/set_functions.go b/pkg/segment/test/set_functions.go index bcddb39bb..7cd895cc7 100644 --- a/pkg/segment/test/set_functions.go +++ b/pkg/segment/test/set_functions.go @@ -14,21 +14,16 @@ package segment -// Basic numeric constants that we define because the math package doesn't. -// TODO(nlacasse): These should be Math.MaxInt64/MinInt64? -const ( - maxInt = int(^uint(0) >> 1) - minInt = -maxInt - 1 -) - type setFunctions struct{} -func (setFunctions) MinKey() int { - return minInt +// MinKey returns the minimum key for the set. +func (s setFunctions) MinKey() int { + return -s.MaxKey() - 1 } +// MaxKey returns the maximum key for the set. func (setFunctions) MaxKey() int { - return maxInt + return int(^uint(0) >> 1) } func (setFunctions) ClearValue(*int) {} @@ -40,3 +35,20 @@ func (setFunctions) Merge(_ Range, val1 int, _ Range, _ int) (int, bool) { func (setFunctions) Split(_ Range, val int, _ int) (int, int) { return val, val } + +type gapSetFunctions struct { + setFunctions +} + +// MinKey is adjusted to make sure no add overflow would happen in test cases. +// e.g. A gap with range {MinInt32, 2} would cause overflow in Range().Length(). +// +// Normally Keys should be unsigned to avoid these issues. +func (s gapSetFunctions) MinKey() int { + return s.setFunctions.MinKey() / 2 +} + +// MaxKey returns the maximum key for the set. +func (s gapSetFunctions) MaxKey() int { + return s.setFunctions.MaxKey() / 2 +} diff --git a/pkg/sentry/arch/syscalls_arm64.go b/pkg/sentry/arch/syscalls_arm64.go index 92d062513..95dfd1e90 100644 --- a/pkg/sentry/arch/syscalls_arm64.go +++ b/pkg/sentry/arch/syscalls_arm64.go @@ -23,7 +23,7 @@ const restartSyscallNr = uintptr(128) // // In linux, at the entry of the syscall handler(el0_svc_common()), value of R0 // is saved to the pt_regs.orig_x0 in kernel code. But currently, the orig_x0 -// was not accessible to the user space application, so we have to do the same +// was not accessible to the userspace application, so we have to do the same // operation in the sentry code to save the R0 value into the App context. func (c *context64) SyscallSaveOrig() { c.OrigR0 = c.Regs.Regs[0] diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go index 6564fd0c6..dd6f5aba6 100644 --- a/pkg/sentry/fs/fsutil/frame_ref_set.go +++ b/pkg/sentry/fs/fsutil/frame_ref_set.go @@ -18,6 +18,7 @@ import ( "math" "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/usage" ) // FrameRefSetFunctions implements segment.Functions for FrameRefSet. @@ -49,3 +50,42 @@ func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform. func (FrameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) { return val, val } + +// IncRefAndAccount adds a reference on the range fr. All newly inserted segments +// are accounted as host page cache memory mappings. +func (refs *FrameRefSet) IncRefAndAccount(fr platform.FileRange) { + seg, gap := refs.Find(fr.Start) + for { + switch { + case seg.Ok() && seg.Start() < fr.End: + seg = refs.Isolate(seg, fr) + seg.SetValue(seg.Value() + 1) + seg, gap = seg.NextNonEmpty() + case gap.Ok() && gap.Start() < fr.End: + newRange := gap.Range().Intersect(fr) + usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped) + seg, gap = refs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty() + default: + refs.MergeAdjacent(fr) + return + } + } +} + +// DecRefAndAccount removes a reference on the range fr and untracks segments +// that are removed from memory accounting. +func (refs *FrameRefSet) DecRefAndAccount(fr platform.FileRange) { + seg := refs.FindSegment(fr.Start) + + for seg.Ok() && seg.Start() < fr.End { + seg = refs.Isolate(seg, fr) + if old := seg.Value(); old == 1 { + usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped) + seg = refs.Remove(seg).NextSegment() + } else { + seg.SetValue(old - 1) + seg = seg.NextSegment() + } + } + refs.MergeAdjacent(fr) +} diff --git a/pkg/sentry/fs/g3doc/fuse.md b/pkg/sentry/fs/g3doc/fuse.md index c3988aa43..635cc009b 100644 --- a/pkg/sentry/fs/g3doc/fuse.md +++ b/pkg/sentry/fs/g3doc/fuse.md @@ -7,20 +7,20 @@ currently incomplete and the document will be updated as things progress. # FUSE: Filesystem in Userspace -The sentry supports dispatching filesystem operations to a FUSE server, -allowing FUSE filesystem to be used with a sandbox. +The sentry supports dispatching filesystem operations to a FUSE server, allowing +FUSE filesystem to be used with a sandbox. ## Overview FUSE has two main components: -1. A client kernel driver (canonically `fuse.ko` in Linux), which forwards - filesystem operations (usually initiated by syscalls) to the server. +1. A client kernel driver (canonically `fuse.ko` in Linux), which forwards + filesystem operations (usually initiated by syscalls) to the server. -2. A server, which is a userspace daemon that implements the actual filesystem. +2. A server, which is a userspace daemon that implements the actual filesystem. -The sentry implements the client component, which allows a server daemon -running within the sandbox to implement a filesystem within the sandbox. +The sentry implements the client component, which allows a server daemon running +within the sandbox to implement a filesystem within the sandbox. A FUSE filesystem is initialized with `mount(2)`, typically with the help of a utility like `fusermount(1)`. Various mount options exist for establishing @@ -30,43 +30,43 @@ and server. The FUSE device FD is obtained by opening `/dev/fuse`. During regular operation, the client and server use the FUSE protocol described in `fuse(4)` to service -filesystem operations. See the "Protocol" section below for more -information about this protocol. The core of the sentry support for FUSE is the -client-side implementation of this protocol. +filesystem operations. See the "Protocol" section below for more information +about this protocol. The core of the sentry support for FUSE is the client-side +implementation of this protocol. ## FUSE in the Sentry The sentry's FUSE client targets VFS2 and has the following components: -- An implementation of `/dev/fuse`. +- An implementation of `/dev/fuse`. -- A VFS2 filesystem for mapping syscalls to FUSE ops. Since we're targeting - VFS2, one point of contention may be the lack of inodes in VFS2. We can - tentatively implement a kernfs-based filesystem to bridge the gap in APIs. The - kernfs base functionality can serve the role of the Linux inode cache and, the - filesystem can map VFS2 syscalls to kernfs inode operations; see the - `kernfs.Inode` interface. +- A VFS2 filesystem for mapping syscalls to FUSE ops. Since we're targeting + VFS2, one point of contention may be the lack of inodes in VFS2. We can + tentatively implement a kernfs-based filesystem to bridge the gap in APIs. + The kernfs base functionality can serve the role of the Linux inode cache + and, the filesystem can map VFS2 syscalls to kernfs inode operations; see + the `kernfs.Inode` interface. -The FUSE protocol lends itself well to marshaling with `go_marshal`. The -various request and response packets can be defined in the ABI package and -converted to and from the wire format using `go_marshal`. +The FUSE protocol lends itself well to marshaling with `go_marshal`. The various +request and response packets can be defined in the ABI package and converted to +and from the wire format using `go_marshal`. ### Design Goals -- While filesystem performance is always important, the sentry's FUSE support is - primarily concerned with compatibility, with performance as a secondary - concern. +- While filesystem performance is always important, the sentry's FUSE support + is primarily concerned with compatibility, with performance as a secondary + concern. -- Avoiding deadlocks from a hung server daemon. +- Avoiding deadlocks from a hung server daemon. -- Consider the potential for denial of service from a malicious server - daemon. Protecting itself from userspace is already a design goal for the - sentry, but needs additional consideration for FUSE. Normally, an operating - system doesn't rely on userspace to make progress with filesystem - operations. Since this changes with FUSE, it opens up the possibility of - creating a chain of dependencies controlled by userspace, which could affect - an entire sandbox. For example: a FUSE op can block a syscall, which could be - holding a subsystem lock, which can then block another task goroutine. +- Consider the potential for denial of service from a malicious server daemon. + Protecting itself from userspace is already a design goal for the sentry, + but needs additional consideration for FUSE. Normally, an operating system + doesn't rely on userspace to make progress with filesystem operations. Since + this changes with FUSE, it opens up the possibility of creating a chain of + dependencies controlled by userspace, which could affect an entire sandbox. + For example: a FUSE op can block a syscall, which could be holding a + subsystem lock, which can then block another task goroutine. ### Milestones @@ -76,23 +76,23 @@ ops can be implemented in parallel. #### Minimal client that can mount a trivial FUSE filesystem. -- Implement `/dev/fuse`. +- Implement `/dev/fuse`. -- Implement basic FUSE ops like `FUSE_INIT`, `FUSE_DESTROY`. +- Implement basic FUSE ops like `FUSE_INIT`, `FUSE_DESTROY`. #### Read-only mount with basic file operations -- Implement the majority of file, directory and file descriptor FUSE ops. For - this milestone, we can skip uncommon or complex operations like mmap, mknod, - file locking, poll, and extended attributes. We can stub these out along with - any ops that modify the filesystem. The exact list of required ops are to be - determined, but the goal is to mount a real filesystem as read-only, and be - able to read contents from the filesystem in the sentry. +- Implement the majority of file, directory and file descriptor FUSE ops. For + this milestone, we can skip uncommon or complex operations like mmap, mknod, + file locking, poll, and extended attributes. We can stub these out along + with any ops that modify the filesystem. The exact list of required ops are + to be determined, but the goal is to mount a real filesystem as read-only, + and be able to read contents from the filesystem in the sentry. #### Full read-write support -- Implement the remaining FUSE ops and decide if we can omit rarely used - operations like ioctl. +- Implement the remaining FUSE ops and decide if we can omit rarely used + operations like ioctl. # Appendix @@ -145,19 +145,19 @@ operations map to the sentry virtual filesystem. These operations are specific to FUSE and don't have a corresponding action in a generic filesystem. -- `FUSE_INIT`: This operation initializes a new FUSE filesystem, and is the - first message sent by the client after mount. This is used for version and - feature negotiation. This is related to `mount(2)`. -- `FUSE_DESTROY`: Teardown a FUSE filesystem, related to `unmount(2)`. -- `FUSE_INTERRUPT`: Interrupts an in-flight operation, specified by the - `fuse_in_header.unique` value provided in the corresponding request - header. The client can send at most one of these per request, and will enter - an uninterruptible wait for a reply. The server is expected to reply promptly. -- `FUSE_FORGET`: A hint to the server that server should evict the indicate node - from any caches. This is wired up to `(struct super_operations).evict_inode` - in Linux, which is in turned hooked as the inode cache shrinker which is - typically triggered by system memory pressure. -- `FUSE_BATCH_FORGET`: Batch version of `FUSE_FORGET`. +- `FUSE_INIT`: This operation initializes a new FUSE filesystem, and is the + first message sent by the client after mount. This is used for version and + feature negotiation. This is related to `mount(2)`. +- `FUSE_DESTROY`: Teardown a FUSE filesystem, related to `unmount(2)`. +- `FUSE_INTERRUPT`: Interrupts an in-flight operation, specified by the + `fuse_in_header.unique` value provided in the corresponding request header. + The client can send at most one of these per request, and will enter an + uninterruptible wait for a reply. The server is expected to reply promptly. +- `FUSE_FORGET`: A hint to the server that server should evict the indicate + node from any caches. This is wired up to `(struct + super_operations).evict_inode` in Linux, which is in turned hooked as the + inode cache shrinker which is typically triggered by system memory pressure. +- `FUSE_BATCH_FORGET`: Batch version of `FUSE_FORGET`. #### Filesystem Syscalls @@ -167,92 +167,94 @@ otherwise noted. Node creation: -- `FUSE_MKNOD` -- `FUSE_MKDIR` -- `FUSE_CREATE`: This is equivalent to `open(2)` and `creat(2)`, which - atomically creates and opens a node. +- `FUSE_MKNOD` +- `FUSE_MKDIR` +- `FUSE_CREATE`: This is equivalent to `open(2)` and `creat(2)`, which + atomically creates and opens a node. Node attributes and extended attributes: -- `FUSE_GETATTR` -- `FUSE_SETATTR` -- `FUSE_SETXATTR` -- `FUSE_GETXATTR` -- `FUSE_LISTXATTR` -- `FUSE_REMOVEXATTR` +- `FUSE_GETATTR` +- `FUSE_SETATTR` +- `FUSE_SETXATTR` +- `FUSE_GETXATTR` +- `FUSE_LISTXATTR` +- `FUSE_REMOVEXATTR` Node link manipulation: -- `FUSE_READLINK` -- `FUSE_LINK` -- `FUSE_SYMLINK` -- `FUSE_UNLINK` +- `FUSE_READLINK` +- `FUSE_LINK` +- `FUSE_SYMLINK` +- `FUSE_UNLINK` Directory operations: -- `FUSE_RMDIR` -- `FUSE_RENAME` -- `FUSE_RENAME2` -- `FUSE_OPENDIR`: `open(2)` for directories. -- `FUSE_RELEASEDIR`: `close(2)` for directories. -- `FUSE_READDIR` -- `FUSE_READDIRPLUS` -- `FUSE_FSYNCDIR`: `fsync(2)` for directories. -- `FUSE_LOOKUP`: Establishes a unique identifier for a FS node. This is - reminiscent of `VirtualFilesystem.GetDentryAt` in that it resolves a path - component to a node. However the returned identifier is opaque to the - client. The server must remember this mapping, as this is how the client will - reference the node in the future. +- `FUSE_RMDIR` +- `FUSE_RENAME` +- `FUSE_RENAME2` +- `FUSE_OPENDIR`: `open(2)` for directories. +- `FUSE_RELEASEDIR`: `close(2)` for directories. +- `FUSE_READDIR` +- `FUSE_READDIRPLUS` +- `FUSE_FSYNCDIR`: `fsync(2)` for directories. +- `FUSE_LOOKUP`: Establishes a unique identifier for a FS node. This is + reminiscent of `VirtualFilesystem.GetDentryAt` in that it resolves a path + component to a node. However the returned identifier is opaque to the + client. The server must remember this mapping, as this is how the client + will reference the node in the future. File operations: -- `FUSE_OPEN`: `open(2)` for files. -- `FUSE_RELEASE`: `close(2)` for files. -- `FUSE_FSYNC` -- `FUSE_FALLOCATE` -- `FUSE_SETUPMAPPING`: Creates a memory map on a file for `mmap(2)`. -- `FUSE_REMOVEMAPPING`: Removes a memory map for `munmap(2)`. +- `FUSE_OPEN`: `open(2)` for files. +- `FUSE_RELEASE`: `close(2)` for files. +- `FUSE_FSYNC` +- `FUSE_FALLOCATE` +- `FUSE_SETUPMAPPING`: Creates a memory map on a file for `mmap(2)`. +- `FUSE_REMOVEMAPPING`: Removes a memory map for `munmap(2)`. File locking: -- `FUSE_GETLK` -- `FUSE_SETLK` -- `FUSE_SETLKW` -- `FUSE_COPY_FILE_RANGE` +- `FUSE_GETLK` +- `FUSE_SETLK` +- `FUSE_SETLKW` +- `FUSE_COPY_FILE_RANGE` File descriptor operations: -- `FUSE_IOCTL` -- `FUSE_POLL` -- `FUSE_LSEEK` +- `FUSE_IOCTL` +- `FUSE_POLL` +- `FUSE_LSEEK` Filesystem operations: -- `FUSE_STATFS` +- `FUSE_STATFS` #### Permissions -- `FUSE_ACCESS` is used to check if a node is accessible, as part of many - syscall implementations. Maps to `vfs.FilesystemImpl.AccessAt` - in the sentry. +- `FUSE_ACCESS` is used to check if a node is accessible, as part of many + syscall implementations. Maps to `vfs.FilesystemImpl.AccessAt` in the + sentry. #### I/O Operations These ops are used to read and write file pages. They're used to implement both I/O syscalls like `read(2)`, `write(2)` and `mmap(2)`. -- `FUSE_READ` -- `FUSE_WRITE` +- `FUSE_READ` +- `FUSE_WRITE` #### Miscellaneous -- `FUSE_FLUSH`: Used by the client to indicate when a file descriptor is - closed. Distinct from `FUSE_FSYNC`, which corresponds to an `fsync(2)` syscall - from the user. Maps to `vfs.FileDescriptorImpl.Release` in the sentry. -- `FUSE_BMAP`: Old address space API for block defrag. Probably not needed. -- `FUSE_NOTIFY_REPLY`: [TODO: what does this do?] +- `FUSE_FLUSH`: Used by the client to indicate when a file descriptor is + closed. Distinct from `FUSE_FSYNC`, which corresponds to an `fsync(2)` + syscall from the user. Maps to `vfs.FileDescriptorImpl.Release` in the + sentry. +- `FUSE_BMAP`: Old address space API for block defrag. Probably not needed. +- `FUSE_NOTIFY_REPLY`: [TODO: what does this do?] # References -- `fuse(4)` manpage. -- Linux kernel FUSE documentation: https://www.kernel.org/doc/html/latest/filesystems/fuse.html +- `fuse(4)` manpage. +- Linux kernel FUSE documentation: + https://www.kernel.org/doc/html/latest/filesystems/fuse.html diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go index e201801d6..f7bc325d1 100644 --- a/pkg/sentry/fsimpl/devpts/line_discipline.go +++ b/pkg/sentry/fsimpl/devpts/line_discipline.go @@ -27,8 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// LINT.IfChange - const ( // canonMaxBytes is the number of bytes that fit into a single line of // terminal input in canonical mode. This corresponds to N_TTY_BUF_SIZE @@ -445,5 +443,3 @@ func (l *lineDiscipline) peek(b []byte) int { } return size } - -// LINT.ThenChange(../../fs/tty/line_discipline.go) diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go index 04a292927..7a7ce5d81 100644 --- a/pkg/sentry/fsimpl/devpts/master.go +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -27,8 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// LINT.IfChange - // masterInode is the inode for the master end of the Terminal. type masterInode struct { kernfs.InodeAttrs @@ -222,5 +220,3 @@ func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) { unimpl.EmitUnimplementedEvent(ctx) } } - -// LINT.ThenChange(../../fs/tty/master.go) diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go index 29a6be858..dffb4232c 100644 --- a/pkg/sentry/fsimpl/devpts/queue.go +++ b/pkg/sentry/fsimpl/devpts/queue.go @@ -25,8 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// LINT.IfChange - // waitBufMaxBytes is the maximum size of a wait buffer. It is based on // TTYB_DEFAULT_MEM_LIMIT. const waitBufMaxBytes = 131072 @@ -236,5 +234,3 @@ func (q *queue) waitBufAppend(b []byte) { q.waitBuf = append(q.waitBuf, b) q.waitBufLen += uint64(len(b)) } - -// LINT.ThenChange(../../fs/tty/queue.go) diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go index 0a98dc896..526cd406c 100644 --- a/pkg/sentry/fsimpl/devpts/slave.go +++ b/pkg/sentry/fsimpl/devpts/slave.go @@ -26,8 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// LINT.IfChange - // slaveInode is the inode for the slave end of the Terminal. type slaveInode struct { kernfs.InodeAttrs @@ -182,5 +180,3 @@ func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem() return sfd.inode.Stat(fs, opts) } - -// LINT.ThenChange(../../fs/tty/slave.go) diff --git a/pkg/sentry/fsimpl/devpts/terminal.go b/pkg/sentry/fsimpl/devpts/terminal.go index b44e673d8..7d2781c54 100644 --- a/pkg/sentry/fsimpl/devpts/terminal.go +++ b/pkg/sentry/fsimpl/devpts/terminal.go @@ -22,8 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -// LINT.IfChanges - // Terminal is a pseudoterminal. // // +stateify savable @@ -120,5 +118,3 @@ func (tm *Terminal) tty(isMaster bool) *kernel.TTY { } return tm.slaveKTTY } - -// LINT.ThenChange(../../fs/tty/terminal.go) diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index 5ce82b793..67e916525 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -36,7 +36,6 @@ go_library( "gofer.go", "handle.go", "p9file.go", - "pagemath.go", "regular_file.go", "socket.go", "special_file.go", diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 353e2cf5b..6295f6b54 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -869,8 +869,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin Size: stat.Mask&linux.STATX_SIZE != 0, ATime: stat.Mask&linux.STATX_ATIME != 0, MTime: stat.Mask&linux.STATX_MTIME != 0, - ATimeNotSystemTime: stat.Atime.Nsec != linux.UTIME_NOW, - MTimeNotSystemTime: stat.Mtime.Nsec != linux.UTIME_NOW, + ATimeNotSystemTime: stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW, + MTimeNotSystemTime: stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW, }, p9.SetAttr{ Permissions: p9.FileMode(stat.Mode), UID: p9.UID(stat.UID), @@ -928,8 +928,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin // so we can't race with Write or another truncate.) d.dataMu.Unlock() if d.size < oldSize { - oldpgend := pageRoundUp(oldSize) - newpgend := pageRoundUp(d.size) + oldpgend, _ := usermem.PageRoundUp(oldSize) + newpgend, _ := usermem.PageRoundUp(d.size) if oldpgend != newpgend { d.mapsMu.Lock() d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 857f7c74e..0d10cf7ac 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -148,9 +148,9 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off return 0, err } // Remove touched pages from the cache. - pgstart := pageRoundDown(uint64(offset)) - pgend := pageRoundUp(uint64(offset + src.NumBytes())) - if pgend < pgstart { + pgstart := usermem.PageRoundDown(uint64(offset)) + pgend, ok := usermem.PageRoundUp(uint64(offset + src.NumBytes())) + if !ok { return 0, syserror.EINVAL } mr := memmap.MappableRange{pgstart, pgend} @@ -306,9 +306,10 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) if fillCache { // Read into the cache, then re-enter the loop to read from the // cache. + gapEnd, _ := usermem.PageRoundUp(gapMR.End) reqMR := memmap.MappableRange{ - Start: pageRoundDown(gapMR.Start), - End: pageRoundUp(gapMR.End), + Start: usermem.PageRoundDown(gapMR.Start), + End: gapEnd, } optMR := gap.Range() err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, rw.d.handle.readToBlocksAt) @@ -671,7 +672,7 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab // Constrain translations to d.size (rounded up) to prevent translation to // pages that may be concurrently truncated. - pgend := pageRoundUp(d.size) + pgend, _ := usermem.PageRoundUp(d.size) var beyondEOF bool if required.End > pgend { if required.Start >= pgend { @@ -818,43 +819,15 @@ type dentryPlatformFile struct { // IncRef implements platform.File.IncRef. func (d *dentryPlatformFile) IncRef(fr platform.FileRange) { d.dataMu.Lock() - seg, gap := d.fdRefs.Find(fr.Start) - for { - switch { - case seg.Ok() && seg.Start() < fr.End: - seg = d.fdRefs.Isolate(seg, fr) - seg.SetValue(seg.Value() + 1) - seg, gap = seg.NextNonEmpty() - case gap.Ok() && gap.Start() < fr.End: - newRange := gap.Range().Intersect(fr) - usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped) - seg, gap = d.fdRefs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty() - default: - d.fdRefs.MergeAdjacent(fr) - d.dataMu.Unlock() - return - } - } + d.fdRefs.IncRefAndAccount(fr) + d.dataMu.Unlock() } // DecRef implements platform.File.DecRef. func (d *dentryPlatformFile) DecRef(fr platform.FileRange) { d.dataMu.Lock() - seg := d.fdRefs.FindSegment(fr.Start) - - for seg.Ok() && seg.Start() < fr.End { - seg = d.fdRefs.Isolate(seg, fr) - if old := seg.Value(); old == 1 { - usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped) - seg = d.fdRefs.Remove(seg).NextSegment() - } else { - seg.SetValue(old - 1) - seg = seg.NextSegment() - } - } - d.fdRefs.MergeAdjacent(fr) + d.fdRefs.DecRefAndAccount(fr) d.dataMu.Unlock() - } // MapInternal implements platform.File.MapInternal. diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index 39509f703..ca0fe6d2b 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -8,6 +8,7 @@ go_library( "control.go", "host.go", "ioctl_unsafe.go", + "mmap.go", "socket.go", "socket_iovec.go", "socket_unsafe.go", @@ -23,12 +24,15 @@ go_library( "//pkg/fspath", "//pkg/log", "//pkg/refs", + "//pkg/safemem", "//pkg/sentry/arch", + "//pkg/sentry/fs/fsutil", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/hostfd", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", + "//pkg/sentry/platform", "//pkg/sentry/socket/control", "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 8caf55a1b..18b127521 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -86,15 +86,13 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) i := &inode{ hostFD: hostFD, - seekable: seekable, + ino: fs.NextIno(), isTTY: opts.IsTTY, - canMap: canMap(uint32(fileType)), wouldBlock: wouldBlock(uint32(fileType)), - ino: fs.NextIno(), - // For simplicity, set offset to 0. Technically, we should use the existing - // offset on the host if the file is seekable. - offset: 0, + seekable: seekable, + canMap: canMap(uint32(fileType)), } + i.pf.inode = i // Non-seekable files can't be memory mapped, assert this. if !i.seekable && i.canMap { @@ -117,6 +115,10 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) // i.open will take a reference on d. defer d.DecRef() + + // For simplicity, fileDescription.offset is set to 0. Technically, we + // should only set to 0 on files that are not seekable (sockets, pipes, + // etc.), and use the offset from the host fd otherwise when importing. return i.open(ctx, d.VFSDentry(), mnt, flags) } @@ -189,11 +191,15 @@ type inode struct { // This field is initialized at creation time and is immutable. hostFD int - // wouldBlock is true if the host FD would return EWOULDBLOCK for - // operations that would block. + // ino is an inode number unique within this filesystem. // // This field is initialized at creation time and is immutable. - wouldBlock bool + ino uint64 + + // isTTY is true if this file represents a TTY. + // + // This field is initialized at creation time and is immutable. + isTTY bool // seekable is false if the host fd points to a file representing a stream, // e.g. a socket or a pipe. Such files are not seekable and can return @@ -202,29 +208,29 @@ type inode struct { // This field is initialized at creation time and is immutable. seekable bool - // isTTY is true if this file represents a TTY. + // wouldBlock is true if the host FD would return EWOULDBLOCK for + // operations that would block. // // This field is initialized at creation time and is immutable. - isTTY bool + wouldBlock bool + + // Event queue for blocking operations. + queue waiter.Queue // canMap specifies whether we allow the file to be memory mapped. // // This field is initialized at creation time and is immutable. canMap bool - // ino is an inode number unique within this filesystem. - // - // This field is initialized at creation time and is immutable. - ino uint64 + // mapsMu protects mappings. + mapsMu sync.Mutex - // offsetMu protects offset. - offsetMu sync.Mutex - - // offset specifies the current file offset. - offset int64 + // If canMap is true, mappings tracks mappings of hostFD into + // memmap.MappingSpaces. + mappings memmap.MappingSet - // Event queue for blocking operations. - queue waiter.Queue + // pf implements platform.File for mappings of hostFD. + pf inodePlatformFile } // CheckPermissions implements kernfs.Inode. @@ -388,6 +394,21 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil { return err } + oldSize := uint64(hostStat.Size) + if s.Size < oldSize { + oldpgend, _ := usermem.PageRoundUp(oldSize) + newpgend, _ := usermem.PageRoundUp(s.Size) + if oldpgend != newpgend { + i.mapsMu.Lock() + i.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ + // Compare Linux's mm/truncate.c:truncate_setsize() => + // truncate_pagecache() => + // mm/memory.c:unmap_mapping_range(evencows=1). + InvalidatePrivate: true, + }) + i.mapsMu.Unlock() + } + } } if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 { ts := [2]syscall.Timespec{ @@ -464,9 +485,6 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u return vfsfd, nil } - // For simplicity, set offset to 0. Technically, we should - // only set to 0 on files that are not seekable (sockets, pipes, etc.), - // and use the offset from the host fd otherwise. fd := &fileDescription{inode: i} vfsfd := &fd.vfsfd if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil { @@ -487,6 +505,13 @@ type fileDescription struct { // // inode is immutable after fileDescription creation. inode *inode + + // offsetMu protects offset. + offsetMu sync.Mutex + + // offset specifies the current file offset. It is only meaningful when + // inode.seekable is true. + offset int64 } // SetStat implements vfs.FileDescriptionImpl. @@ -532,10 +557,10 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts return n, err } // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. - i.offsetMu.Lock() - n, err := readFromHostFD(ctx, i.hostFD, dst, i.offset, opts.Flags) - i.offset += n - i.offsetMu.Unlock() + f.offsetMu.Lock() + n, err := readFromHostFD(ctx, i.hostFD, dst, f.offset, opts.Flags) + f.offset += n + f.offsetMu.Unlock() return n, err } @@ -572,10 +597,10 @@ func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opt } // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. // TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file. - i.offsetMu.Lock() - n, err := writeToHostFD(ctx, i.hostFD, src, i.offset, opts.Flags) - i.offset += n - i.offsetMu.Unlock() + f.offsetMu.Lock() + n, err := writeToHostFD(ctx, i.hostFD, src, f.offset, opts.Flags) + f.offset += n + f.offsetMu.Unlock() return n, err } @@ -600,41 +625,41 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i return 0, syserror.ESPIPE } - i.offsetMu.Lock() - defer i.offsetMu.Unlock() + f.offsetMu.Lock() + defer f.offsetMu.Unlock() switch whence { case linux.SEEK_SET: if offset < 0 { - return i.offset, syserror.EINVAL + return f.offset, syserror.EINVAL } - i.offset = offset + f.offset = offset case linux.SEEK_CUR: - // Check for overflow. Note that underflow cannot occur, since i.offset >= 0. - if offset > math.MaxInt64-i.offset { - return i.offset, syserror.EOVERFLOW + // Check for overflow. Note that underflow cannot occur, since f.offset >= 0. + if offset > math.MaxInt64-f.offset { + return f.offset, syserror.EOVERFLOW } - if i.offset+offset < 0 { - return i.offset, syserror.EINVAL + if f.offset+offset < 0 { + return f.offset, syserror.EINVAL } - i.offset += offset + f.offset += offset case linux.SEEK_END: var s syscall.Stat_t if err := syscall.Fstat(i.hostFD, &s); err != nil { - return i.offset, err + return f.offset, err } size := s.Size // Check for overflow. Note that underflow cannot occur, since size >= 0. if offset > math.MaxInt64-size { - return i.offset, syserror.EOVERFLOW + return f.offset, syserror.EOVERFLOW } if size+offset < 0 { - return i.offset, syserror.EINVAL + return f.offset, syserror.EINVAL } - i.offset = size + offset + f.offset = size + offset case linux.SEEK_DATA, linux.SEEK_HOLE: // Modifying the offset in the host file table should not matter, since @@ -643,16 +668,16 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i // For reading and writing, we always rely on our internal offset. n, err := unix.Seek(i.hostFD, offset, int(whence)) if err != nil { - return i.offset, err + return f.offset, err } - i.offset = n + f.offset = n default: // Invalid whence. - return i.offset, syserror.EINVAL + return f.offset, syserror.EINVAL } - return i.offset, nil + return f.offset, nil } // Sync implements FileDescriptionImpl. @@ -666,8 +691,9 @@ func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts if !f.inode.canMap { return syserror.ENODEV } - // TODO(gvisor.dev/issue/1672): Implement ConfigureMMap and Mappable interface. - return syserror.ENODEV + i := f.inode + i.pf.fileMapperInitOnce.Do(i.pf.fileMapper.Init) + return vfs.GenericConfigureMMap(&f.vfsfd, i, opts) } // EventRegister implements waiter.Waitable.EventRegister. diff --git a/pkg/sentry/fsimpl/host/mmap.go b/pkg/sentry/fsimpl/host/mmap.go new file mode 100644 index 000000000..8545a82f0 --- /dev/null +++ b/pkg/sentry/fsimpl/host/mmap.go @@ -0,0 +1,132 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" +) + +// inodePlatformFile implements platform.File. It exists solely because inode +// cannot implement both kernfs.Inode.IncRef and platform.File.IncRef. +// +// inodePlatformFile should only be used if inode.canMap is true. +type inodePlatformFile struct { + *inode + + // fdRefsMu protects fdRefs. + fdRefsMu sync.Mutex + + // fdRefs counts references on platform.File offsets. It is used solely for + // memory accounting. + fdRefs fsutil.FrameRefSet + + // fileMapper caches mappings of the host file represented by this inode. + fileMapper fsutil.HostFileMapper + + // fileMapperInitOnce is used to lazily initialize fileMapper. + fileMapperInitOnce sync.Once +} + +// IncRef implements platform.File.IncRef. +// +// Precondition: i.inode.canMap must be true. +func (i *inodePlatformFile) IncRef(fr platform.FileRange) { + i.fdRefsMu.Lock() + i.fdRefs.IncRefAndAccount(fr) + i.fdRefsMu.Unlock() +} + +// DecRef implements platform.File.DecRef. +// +// Precondition: i.inode.canMap must be true. +func (i *inodePlatformFile) DecRef(fr platform.FileRange) { + i.fdRefsMu.Lock() + i.fdRefs.DecRefAndAccount(fr) + i.fdRefsMu.Unlock() +} + +// MapInternal implements platform.File.MapInternal. +// +// Precondition: i.inode.canMap must be true. +func (i *inodePlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { + return i.fileMapper.MapInternal(fr, i.hostFD, at.Write) +} + +// FD implements platform.File.FD. +func (i *inodePlatformFile) FD() int { + return i.hostFD +} + +// AddMapping implements memmap.Mappable.AddMapping. +// +// Precondition: i.inode.canMap must be true. +func (i *inode) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { + i.mapsMu.Lock() + mapped := i.mappings.AddMapping(ms, ar, offset, writable) + for _, r := range mapped { + i.pf.fileMapper.IncRefOn(r) + } + i.mapsMu.Unlock() + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +// +// Precondition: i.inode.canMap must be true. +func (i *inode) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { + i.mapsMu.Lock() + unmapped := i.mappings.RemoveMapping(ms, ar, offset, writable) + for _, r := range unmapped { + i.pf.fileMapper.DecRefOn(r) + } + i.mapsMu.Unlock() +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +// +// Precondition: i.inode.canMap must be true. +func (i *inode) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { + return i.AddMapping(ctx, ms, dstAR, offset, writable) +} + +// Translate implements memmap.Mappable.Translate. +// +// Precondition: i.inode.canMap must be true. +func (i *inode) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + mr := optional + return []memmap.Translation{ + { + Source: mr, + File: &i.pf, + Offset: mr.Start, + Perms: usermem.AnyAccess, + }, + }, nil +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +// +// Precondition: i.inode.canMap must be true. +func (i *inode) InvalidateUnsavable(ctx context.Context) error { + // We expect the same host fd across save/restore, so all translations + // should be valid. + return nil +} diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index a2d9649e7..007be1572 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -52,7 +52,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/lock", - "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", @@ -96,6 +95,7 @@ go_test( "pipe_test.go", "regular_file_test.go", "stat_test.go", + "tmpfs_test.go", ], library = ":tmpfs", deps = [ @@ -105,7 +105,6 @@ go_test( "//pkg/sentry/contexttest", "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/contexttest", "//pkg/sentry/vfs", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 36ffcb592..80fa7b29d 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -16,6 +16,7 @@ package tmpfs import ( "fmt" + "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -24,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Sync implements vfs.FilesystemImpl.Sync. @@ -76,8 +78,8 @@ afterSymlink: return nil, err } if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { - // TODO(gvisor.dev/issue/1197): Symlink traversals updates - // access time. + // Symlink traversal updates access time. + atomic.StoreInt64(&d.inode.atime, d.inode.fs.clock.Now().Nanoseconds()) if err := rp.HandleSymlink(symlink.target); err != nil { return nil, err } @@ -361,8 +363,8 @@ afterTrailingSymlink: } // Do we need to resolve a trailing symlink? if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { - // TODO(gvisor.dev/issue/1197): Symlink traversals updates - // access time. + // Symlink traversal updates access time. + atomic.StoreInt64(&child.inode.atime, child.inode.fs.clock.Now().Nanoseconds()) if err := rp.HandleSymlink(symlink.target); err != nil { return nil, err } @@ -636,12 +638,19 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { fs.mu.RLock() defer fs.mu.RUnlock() - _, err := resolveLocked(rp) - if err != nil { + if _, err := resolveLocked(rp); err != nil { return linux.Statfs{}, err } - // TODO(gvisor.dev/issue/1197): Actually implement statfs. - return linux.Statfs{}, syserror.ENOSYS + statfs := linux.Statfs{ + Type: linux.TMPFS_MAGIC, + BlockSize: usermem.PageSize, + FragmentSize: usermem.PageSize, + NameLength: linux.NAME_MAX, + // TODO(b/29637826): Allow configuring a tmpfs size and enforce it. + Blocks: 0, + BlocksFree: 0, + } + return statfs, nil } // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. @@ -763,5 +772,24 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.mu.RLock() defer fs.mu.RUnlock() - return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) + mnt := vd.Mount() + d := vd.Dentry().Impl().(*dentry) + for { + if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() { + return vfs.PrependPathAtVFSRootError{} + } + if &d.vfsd == mnt.Root() { + return nil + } + if d.parent == nil { + if d.name != "" { + // This must be an anonymous memfd file. + b.PrependComponent("/" + d.name) + return vfs.PrependPathSyntheticError{} + } + return vfs.PrependPathAtNonMountRootError{} + } + b.PrependComponent(d.name) + d = d.parent + } } diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 57e5e28ec..3f433d666 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -88,6 +88,7 @@ type regularFile struct { func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode { file := ®ularFile{ memFile: fs.memFile, + seals: linux.F_SEAL_SEAL, } file.inode.init(file, fs, creds, linux.S_IFREG|mode) file.inode.nlink = 1 // from parent directory @@ -577,3 +578,44 @@ exitLoop: return done, retErr } + +// GetSeals returns the current set of seals on a memfd inode. +func GetSeals(fd *vfs.FileDescription) (uint32, error) { + f, ok := fd.Impl().(*regularFileFD) + if !ok { + return 0, syserror.EINVAL + } + rf := f.inode().impl.(*regularFile) + rf.dataMu.RLock() + defer rf.dataMu.RUnlock() + return rf.seals, nil +} + +// AddSeals adds new file seals to a memfd inode. +func AddSeals(fd *vfs.FileDescription, val uint32) error { + f, ok := fd.Impl().(*regularFileFD) + if !ok { + return syserror.EINVAL + } + rf := f.inode().impl.(*regularFile) + rf.mapsMu.Lock() + defer rf.mapsMu.Unlock() + rf.dataMu.RLock() + defer rf.dataMu.RUnlock() + + if rf.seals&linux.F_SEAL_SEAL != 0 { + // Seal applied which prevents addition of any new seals. + return syserror.EPERM + } + + // F_SEAL_WRITE can only be added if there are no active writable maps. + if rf.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 { + if rf.writableMappingPages > 0 { + return syserror.EBUSY + } + } + + // Seals can only be added, never removed. + rf.seals |= val + return nil +} diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index 0399725cf..64e1c40ad 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -18,152 +18,16 @@ import ( "bytes" "fmt" "io" - "sync/atomic" "testing" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs/lock" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) -// nextFileID is used to generate unique file names. -var nextFileID int64 - -// newTmpfsRoot creates a new tmpfs mount, and returns the root. If the error -// is not nil, then cleanup should be called when the root is no longer needed. -func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) { - creds := auth.CredentialsFromContext(ctx) - - vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { - return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err) - } - - vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ - AllowUserMount: true, - }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) - if err != nil { - return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err) - } - root := mntns.Root() - return vfsObj, root, func() { - root.DecRef() - mntns.DecRef() - }, nil -} - -// newFileFD creates a new file in a new tmpfs mount, and returns the FD. If -// the returned err is not nil, then cleanup should be called when the FD is no -// longer needed. -func newFileFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { - creds := auth.CredentialsFromContext(ctx) - vfsObj, root, cleanup, err := newTmpfsRoot(ctx) - if err != nil { - return nil, nil, err - } - - filename := fmt.Sprintf("tmpfs-test-file-%d", atomic.AddInt64(&nextFileID, 1)) - - // Create the file that will be write/read. - fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(filename), - }, &vfs.OpenOptions{ - Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, - Mode: linux.ModeRegular | mode, - }) - if err != nil { - cleanup() - return nil, nil, fmt.Errorf("failed to create file %q: %v", filename, err) - } - - return fd, cleanup, nil -} - -// newDirFD is like newFileFD, but for directories. -func newDirFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { - creds := auth.CredentialsFromContext(ctx) - vfsObj, root, cleanup, err := newTmpfsRoot(ctx) - if err != nil { - return nil, nil, err - } - - dirname := fmt.Sprintf("tmpfs-test-dir-%d", atomic.AddInt64(&nextFileID, 1)) - - // Create the dir. - if err := vfsObj.MkdirAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(dirname), - }, &vfs.MkdirOptions{ - Mode: linux.ModeDirectory | mode, - }); err != nil { - cleanup() - return nil, nil, fmt.Errorf("failed to create directory %q: %v", dirname, err) - } - - // Open the dir and return it. - fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(dirname), - }, &vfs.OpenOptions{ - Flags: linux.O_RDONLY | linux.O_DIRECTORY, - }) - if err != nil { - cleanup() - return nil, nil, fmt.Errorf("failed to open directory %q: %v", dirname, err) - } - - return fd, cleanup, nil -} - -// newPipeFD is like newFileFD, but for pipes. -func newPipeFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { - creds := auth.CredentialsFromContext(ctx) - vfsObj, root, cleanup, err := newTmpfsRoot(ctx) - if err != nil { - return nil, nil, err - } - - pipename := fmt.Sprintf("tmpfs-test-pipe-%d", atomic.AddInt64(&nextFileID, 1)) - - // Create the pipe. - if err := vfsObj.MknodAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(pipename), - }, &vfs.MknodOptions{ - Mode: linux.ModeNamedPipe | mode, - }); err != nil { - cleanup() - return nil, nil, fmt.Errorf("failed to create pipe %q: %v", pipename, err) - } - - // Open the pipe and return it. - fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(pipename), - }, &vfs.OpenOptions{ - Flags: linux.O_RDWR, - }) - if err != nil { - cleanup() - return nil, nil, fmt.Errorf("failed to open pipe %q: %v", pipename, err) - } - - return fd, cleanup, nil -} - // Test that we can write some data to a file and read it back.` func TestSimpleWriteRead(t *testing.T) { ctx := contexttest.Context(t) diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go index 60c2c980e..f7ee4aab2 100644 --- a/pkg/sentry/fsimpl/tmpfs/stat_test.go +++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go @@ -19,8 +19,8 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" "gvisor.dev/gvisor/pkg/sentry/vfs" ) @@ -29,7 +29,6 @@ func TestStatAfterCreate(t *testing.T) { mode := linux.FileMode(0644) // Run with different file types. - // TODO(gvisor.dev/issue/1197): Also test symlinks and sockets. for _, typ := range []string{"file", "dir", "pipe"} { t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) { var ( @@ -175,7 +174,6 @@ func TestSetStat(t *testing.T) { mode := linux.FileMode(0644) // Run with different file types. - // TODO(gvisor.dev/issue/1197): Also test symlinks and sockets. for _, typ := range []string{"file", "dir", "pipe"} { t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) { var ( diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 405928bd0..1e781aecd 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -94,7 +94,7 @@ type FilesystemOpts struct { } // GetFilesystem implements vfs.FilesystemType.GetFilesystem. -func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { +func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, _ string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx) if memFileProvider == nil { panic("MemoryFileProviderFromContext returned nil") @@ -139,6 +139,11 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return &fs.vfsfs, &root.vfsd, nil } +// NewFilesystem returns a new tmpfs filesystem. +func NewFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*vfs.Filesystem, *vfs.Dentry, error) { + return FilesystemType{}.GetFilesystem(ctx, vfsObj, creds, "", vfs.GetFilesystemOptions{}) +} + // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release() { fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) @@ -658,3 +663,34 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { return fd.inode().removexattr(auth.CredentialsFromContext(ctx), name) } + +// NewMemfd creates a new tmpfs regular file and file description that can back +// an anonymous fd created by memfd_create. +func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name string) (*vfs.FileDescription, error) { + fs, ok := mount.Filesystem().Impl().(*filesystem) + if !ok { + panic("NewMemfd() called with non-tmpfs mount") + } + + // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with + // S_IRWXUGO. + mode := linux.FileMode(0777) + inode := fs.newRegularFile(creds, mode) + rf := inode.impl.(*regularFile) + if allowSeals { + rf.seals = 0 + } + + d := fs.newDentry(inode) + defer d.DecRef() + d.name = name + + // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with + // FMODE_READ | FMODE_WRITE. + var fd regularFileFD + flags := uint32(linux.O_RDWR) + if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go new file mode 100644 index 000000000..a240fb276 --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go @@ -0,0 +1,156 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// nextFileID is used to generate unique file names. +var nextFileID int64 + +// newTmpfsRoot creates a new tmpfs mount, and returns the root. If the error +// is not nil, then cleanup should be called when the root is no longer needed. +func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) { + creds := auth.CredentialsFromContext(ctx) + + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err) + } + + vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + if err != nil { + return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err) + } + root := mntns.Root() + return vfsObj, root, func() { + root.DecRef() + mntns.DecRef() + }, nil +} + +// newFileFD creates a new file in a new tmpfs mount, and returns the FD. If +// the returned err is not nil, then cleanup should be called when the FD is no +// longer needed. +func newFileFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { + creds := auth.CredentialsFromContext(ctx) + vfsObj, root, cleanup, err := newTmpfsRoot(ctx) + if err != nil { + return nil, nil, err + } + + filename := fmt.Sprintf("tmpfs-test-file-%d", atomic.AddInt64(&nextFileID, 1)) + + // Create the file that will be write/read. + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(filename), + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, + Mode: linux.ModeRegular | mode, + }) + if err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to create file %q: %v", filename, err) + } + + return fd, cleanup, nil +} + +// newDirFD is like newFileFD, but for directories. +func newDirFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { + creds := auth.CredentialsFromContext(ctx) + vfsObj, root, cleanup, err := newTmpfsRoot(ctx) + if err != nil { + return nil, nil, err + } + + dirname := fmt.Sprintf("tmpfs-test-dir-%d", atomic.AddInt64(&nextFileID, 1)) + + // Create the dir. + if err := vfsObj.MkdirAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(dirname), + }, &vfs.MkdirOptions{ + Mode: linux.ModeDirectory | mode, + }); err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to create directory %q: %v", dirname, err) + } + + // Open the dir and return it. + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(dirname), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY | linux.O_DIRECTORY, + }) + if err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to open directory %q: %v", dirname, err) + } + + return fd, cleanup, nil +} + +// newPipeFD is like newFileFD, but for pipes. +func newPipeFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { + creds := auth.CredentialsFromContext(ctx) + vfsObj, root, cleanup, err := newTmpfsRoot(ctx) + if err != nil { + return nil, nil, err + } + + name := fmt.Sprintf("tmpfs-test-%d", atomic.AddInt64(&nextFileID, 1)) + + if err := vfsObj.MknodAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(name), + }, &vfs.MknodOptions{ + Mode: linux.ModeNamedPipe | mode, + }); err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to create pipe %q: %v", name, err) + } + + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(name), + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR, + }) + if err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to open pipe %q: %v", name, err) + } + + return fd, cleanup, nil +} diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 8104f50f3..a28eab8b8 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -173,6 +173,7 @@ go_library( "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/fsimpl/timerfd", + "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/hostcpu", "//pkg/sentry/inet", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 3617da8c6..5efeb3767 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -53,6 +53,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -259,6 +260,10 @@ type Kernel struct { // syscalls (as opposed to named pipes created by mknod()). pipeMount *vfs.Mount + // shmMount is the Mount used for anonymous files created by the + // memfd_create() syscalls. It is analagous to Linux's shm_mnt. + shmMount *vfs.Mount + // socketMount is the Mount used for sockets created by the socket() and // socketpair() syscalls. There are several cases where a socket dentry will // not be contained in socketMount: @@ -330,6 +335,9 @@ func (k *Kernel) Init(args InitKernelArgs) error { if args.Timekeeper == nil { return fmt.Errorf("Timekeeper is nil") } + if args.Timekeeper.clocks == nil { + return fmt.Errorf("Must call Timekeeper.SetClocks() before Kernel.Init()") + } if args.RootUserNamespace == nil { return fmt.Errorf("RootUserNamespace is nil") } @@ -384,6 +392,18 @@ func (k *Kernel) Init(args InitKernelArgs) error { } k.pipeMount = pipeMount + tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(k.SupervisorContext(), &k.vfs, auth.NewRootCredentials(k.rootUserNamespace)) + if err != nil { + return fmt.Errorf("failed to create tmpfs filesystem: %v", err) + } + defer tmpfsFilesystem.DecRef() + defer tmpfsRoot.DecRef() + shmMount, err := k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{}) + if err != nil { + return fmt.Errorf("failed to create tmpfs mount: %v", err) + } + k.shmMount = shmMount + socketFilesystem, err := sockfs.NewFilesystem(&k.vfs) if err != nil { return fmt.Errorf("failed to create sockfs filesystem: %v", err) @@ -1656,6 +1676,11 @@ func (k *Kernel) PipeMount() *vfs.Mount { return k.pipeMount } +// ShmMount returns the tmpfs mount. +func (k *Kernel) ShmMount() *vfs.Mount { + return k.shmMount +} + // SocketMount returns the sockfs mount. func (k *Kernel) SocketMount() *vfs.Mount { return k.socketMount diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go index 5a1d4fd57..aacf28da2 100644 --- a/pkg/sentry/kernel/pipe/pipe_util.go +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -144,7 +144,7 @@ func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgume if v > math.MaxInt32 { v = math.MaxInt32 // Silently truncate. } - // Copy result to user-space. + // Copy result to userspace. _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ AddressSpaceActive: true, }) diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index c9db78e06..a5903b0b5 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -199,10 +199,10 @@ func (t *Task) doSyscall() taskRunState { // // On x86, register rax was shared by syscall number and return // value, and at the entry of the syscall handler, the rax was - // saved to regs.orig_rax which was exposed to user space. + // saved to regs.orig_rax which was exposed to userspace. // But on arm64, syscall number was passed through X8, and the X0 // was shared by the first syscall argument and return value. The - // X0 was saved to regs.orig_x0 which was not exposed to user space. + // X0 was saved to regs.orig_x0 which was not exposed to userspace. // So we have to do the same operation here to save the X0 value // into the task context. t.Arch().SyscallSaveOrig() diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 73591dab7..a036ce53c 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -25,6 +25,7 @@ go_template_instance( out = "vma_set.go", consts = { "minDegree": "8", + "trackGaps": "1", }, imports = { "usermem": "gvisor.dev/gvisor/pkg/usermem", diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 9a14e69e6..16d8207e9 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -195,7 +195,7 @@ func (mm *MemoryManager) applicationAddrRange() usermem.AddrRange { // Preconditions: mm.mappingMu must be locked. func (mm *MemoryManager) findLowestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) { - for gap := mm.vmas.LowerBoundGap(bounds.Start); gap.Ok() && gap.Start() < bounds.End; gap = gap.NextGap() { + for gap := mm.vmas.LowerBoundGap(bounds.Start); gap.Ok() && gap.Start() < bounds.End; gap = gap.NextLargeEnoughGap(usermem.Addr(length)) { if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length { // Can we shift up to match the alignment? if offset := uint64(gr.Start) % alignment; offset != 0 { @@ -214,7 +214,7 @@ func (mm *MemoryManager) findLowestAvailableLocked(length, alignment uint64, bou // Preconditions: mm.mappingMu must be locked. func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) { - for gap := mm.vmas.UpperBoundGap(bounds.End); gap.Ok() && gap.End() > bounds.Start; gap = gap.PrevGap() { + for gap := mm.vmas.UpperBoundGap(bounds.End); gap.Ok() && gap.End() > bounds.Start; gap = gap.PrevLargeEnoughGap(usermem.Addr(length)) { if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length { // Can we shift down to match the alignment? start := gr.End - usermem.Addr(length) diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go index 444a83913..a6345010d 100644 --- a/pkg/sentry/platform/ring0/lib_arm64.go +++ b/pkg/sentry/platform/ring0/lib_arm64.go @@ -38,6 +38,12 @@ func SaveVRegs(*byte) // LoadVRegs loads V0-V31 registers. func LoadVRegs(*byte) +// GetTLS returns the value of TPIDR_EL0 register. +func GetTLS() (value uint64) + +// SetTLS writes the TPIDR_EL0 value. +func SetTLS(value uint64) + // Init sets function pointers based on architectural features. // // This must be called prior to using ring0. diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s index 0e6a6235b..b63e14b41 100644 --- a/pkg/sentry/platform/ring0/lib_arm64.s +++ b/pkg/sentry/platform/ring0/lib_arm64.s @@ -15,6 +15,16 @@ #include "funcdata.h" #include "textflag.h" +TEXT ·GetTLS(SB),NOSPLIT,$0-8 + MRS TPIDR_EL0, R1 + MOVD R1, ret+0(FP) + RET + +TEXT ·SetTLS(SB),NOSPLIT,$0-8 + MOVD addr+0(FP), R1 + MSR R1, TPIDR_EL0 + RET + TEXT ·CPACREL1(SB),NOSPLIT,$0-8 WORD $0xd5381041 // MRS CPACR_EL1, R1 MOVD R1, ret+0(FP) diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index b49433326..c11e82c10 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -555,7 +555,7 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b if uint64(src.NumBytes()) != srcs.NumBytes() { return 0, nil } - if srcs.IsEmpty() { + if srcs.IsEmpty() && len(controlBuf) == 0 { return 0, nil } diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 9d032f052..60df51dae 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -1321,6 +1321,29 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa return int32(time.Duration(v) / time.Second), nil + case linux.TCP_SYNCNT: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptInt(tcpip.TCPSynCountOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil + + case linux.TCP_WINDOW_CLAMP: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptInt(tcpip.TCPWindowClampOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil default: emitUnimplementedEventTCP(t, name) } @@ -1790,6 +1813,22 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * } return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v)))) + case linux.TCP_SYNCNT: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + v := usermem.ByteOrder.Uint32(optVal) + + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPSynCountOption, int(v))) + + case linux.TCP_WINDOW_CLAMP: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + v := usermem.ByteOrder.Uint32(optVal) + + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPWindowClampOption, int(v))) + case linux.TCP_REPAIR_OPTIONS: t.Kernel().EmitUnimplementedEvent(t) @@ -2679,7 +2718,7 @@ func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.Sy v = math.MaxInt32 } - // Copy result to user-space. + // Copy result to userspace. _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ AddressSpaceActive: true, }) @@ -2748,7 +2787,7 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc if v > math.MaxInt32 { v = math.MaxInt32 } - // Copy result to user-space. + // Copy result to userspace. _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ AddressSpaceActive: true, }) @@ -2764,7 +2803,7 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc v = math.MaxInt32 } - // Copy result to user-space. + // Copy result to userspace. _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ AddressSpaceActive: true, }) diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go index df0d0f461..39f2b79ec 100644 --- a/pkg/sentry/syscalls/linux/sys_splice.go +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -16,7 +16,6 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -26,7 +25,6 @@ import ( // doSplice implements a blocking splice operation. func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonBlocking bool) (int64, error) { - log.Infof("NLAC: doSplice opts: %+v", opts) if opts.Length < 0 || opts.SrcStart < 0 || opts.DstStart < 0 || (opts.SrcStart+opts.Length < 0) { return 0, syserror.EINVAL } diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index c32f942fb..f882ef840 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -13,6 +13,7 @@ go_library( "fscontext.go", "getdents.go", "ioctl.go", + "memfd.go", "mmap.go", "path.go", "pipe.go", @@ -43,6 +44,7 @@ go_library( "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/fsimpl/signalfd", "//pkg/sentry/fsimpl/timerfd", + "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index 8181d80f4..ca0f7fd1e 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -17,6 +17,7 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" @@ -157,6 +158,15 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, syserror.EBADF } return uintptr(pipefile.PipeSize()), nil, nil + case linux.F_GET_SEALS: + val, err := tmpfs.GetSeals(file) + return uintptr(val), nil, err + case linux.F_ADD_SEALS: + if !file.IsWritable() { + return 0, nil, syserror.EPERM + } + err := tmpfs.AddSeals(file, args[2].Uint()) + return 0, nil, err default: // TODO(gvisor.dev/issue/1623): Everything else is not yet supported. return 0, nil, syserror.EINVAL diff --git a/pkg/sentry/syscalls/linux/vfs2/memfd.go b/pkg/sentry/syscalls/linux/vfs2/memfd.go new file mode 100644 index 000000000..bbe248d17 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/memfd.go @@ -0,0 +1,63 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +const ( + memfdPrefix = "memfd:" + memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix) + memfdAllFlags = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING) +) + +// MemfdCreate implements the linux syscall memfd_create(2). +func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := args[1].Uint() + + if flags&^memfdAllFlags != 0 { + // Unknown bits in flags. + return 0, nil, syserror.EINVAL + } + + allowSeals := flags&linux.MFD_ALLOW_SEALING != 0 + cloExec := flags&linux.MFD_CLOEXEC != 0 + + name, err := t.CopyInString(addr, memfdMaxNameLen) + if err != nil { + return 0, nil, err + } + + shmMount := t.Kernel().ShmMount() + file, err := tmpfs.NewMemfd(shmMount, t.Credentials(), allowSeals, memfdPrefix+name) + if err != nil { + return 0, nil, err + } + + fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ + CloseOnExec: cloExec, + }) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go index 4e61f1452..09ecfed26 100644 --- a/pkg/sentry/syscalls/linux/vfs2/setstat.go +++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go @@ -246,73 +246,104 @@ func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, err } - opts := vfs.SetStatOptions{ - Stat: linux.Statx{ - Mask: linux.STATX_ATIME | linux.STATX_MTIME, - }, - } - if timesAddr == 0 { - opts.Stat.Atime.Nsec = linux.UTIME_NOW - opts.Stat.Mtime.Nsec = linux.UTIME_NOW - } else { - var times [2]linux.Timeval - if _, err := t.CopyIn(timesAddr, ×); err != nil { - return 0, nil, err - } - opts.Stat.Atime = linux.StatxTimestamp{ - Sec: times[0].Sec, - Nsec: uint32(times[0].Usec * 1000), - } - opts.Stat.Mtime = linux.StatxTimestamp{ - Sec: times[1].Sec, - Nsec: uint32(times[1].Usec * 1000), - } + var opts vfs.SetStatOptions + if err := populateSetStatOptionsForUtimes(t, timesAddr, &opts); err != nil { + return 0, nil, err } return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts) } -// Utimensat implements Linux syscall utimensat(2). -func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { +// Futimesat implements Linux syscall futimesat(2). +func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() pathAddr := args[1].Pointer() timesAddr := args[2].Pointer() - flags := args[3].Int() - if flags&^linux.AT_SYMLINK_NOFOLLOW != 0 { - return 0, nil, syserror.EINVAL - } - - path, err := copyInPath(t, pathAddr) - if err != nil { - return 0, nil, err + // "If filename is NULL and dfd refers to an open file, then operate on the + // file. Otherwise look up filename, possibly using dfd as a starting + // point." - fs/utimes.c + var path fspath.Path + shouldAllowEmptyPath := allowEmptyPath + if dirfd == linux.AT_FDCWD || pathAddr != 0 { + var err error + path, err = copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + shouldAllowEmptyPath = disallowEmptyPath } var opts vfs.SetStatOptions - if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil { + if err := populateSetStatOptionsForUtimes(t, timesAddr, &opts); err != nil { return 0, nil, err } - return 0, nil, setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &opts) + return 0, nil, setstatat(t, dirfd, path, shouldAllowEmptyPath, followFinalSymlink, &opts) } -// Futimens implements Linux syscall futimens(2). -func Futimens(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := args[0].Int() - timesAddr := args[1].Pointer() - - file := t.GetFileVFS2(fd) - if file == nil { - return 0, nil, syserror.EBADF +func populateSetStatOptionsForUtimes(t *kernel.Task, timesAddr usermem.Addr, opts *vfs.SetStatOptions) error { + if timesAddr == 0 { + opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME + opts.Stat.Atime.Nsec = linux.UTIME_NOW + opts.Stat.Mtime.Nsec = linux.UTIME_NOW + return nil } - defer file.DecRef() + var times [2]linux.Timeval + if _, err := t.CopyIn(timesAddr, ×); err != nil { + return err + } + if times[0].Usec < 0 || times[0].Usec > 999999 || times[1].Usec < 0 || times[1].Usec > 999999 { + return syserror.EINVAL + } + opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME + opts.Stat.Atime = linux.StatxTimestamp{ + Sec: times[0].Sec, + Nsec: uint32(times[0].Usec * 1000), + } + opts.Stat.Mtime = linux.StatxTimestamp{ + Sec: times[1].Sec, + Nsec: uint32(times[1].Usec * 1000), + } + return nil +} +// Utimensat implements Linux syscall utimensat(2). +func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + timesAddr := args[2].Pointer() + flags := args[3].Int() + + // Linux requires that the UTIME_OMIT check occur before checking path or + // flags. var opts vfs.SetStatOptions if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil { return 0, nil, err } + if opts.Stat.Mask == 0 { + return 0, nil, nil + } - return 0, nil, file.SetStat(t, opts) + if flags&^linux.AT_SYMLINK_NOFOLLOW != 0 { + return 0, nil, syserror.EINVAL + } + + // "If filename is NULL and dfd refers to an open file, then operate on the + // file. Otherwise look up filename, possibly using dfd as a starting + // point." - fs/utimes.c + var path fspath.Path + shouldAllowEmptyPath := allowEmptyPath + if dirfd == linux.AT_FDCWD || pathAddr != 0 { + var err error + path, err = copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + shouldAllowEmptyPath = disallowEmptyPath + } + + return 0, nil, setstatat(t, dirfd, path, shouldAllowEmptyPath, shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts) } func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, opts *vfs.SetStatOptions) error { @@ -327,6 +358,9 @@ func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, op return err } if times[0].Nsec != linux.UTIME_OMIT { + if times[0].Nsec != linux.UTIME_NOW && (times[0].Nsec < 0 || times[0].Nsec > 999999999) { + return syserror.EINVAL + } opts.Stat.Mask |= linux.STATX_ATIME opts.Stat.Atime = linux.StatxTimestamp{ Sec: times[0].Sec, @@ -334,6 +368,9 @@ func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, op } } if times[1].Nsec != linux.UTIME_OMIT { + if times[1].Nsec != linux.UTIME_NOW && (times[1].Nsec < 0 || times[1].Nsec > 999999999) { + return syserror.EINVAL + } opts.Stat.Mask |= linux.STATX_MTIME opts.Stat.Mtime = linux.StatxTimestamp{ Sec: times[1].Sec, diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index 9c04677f1..a332d01bd 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -123,7 +123,7 @@ func Override() { s.Table[258] = syscalls.Supported("mkdirat", Mkdirat) s.Table[259] = syscalls.Supported("mknodat", Mknodat) s.Table[260] = syscalls.Supported("fchownat", Fchownat) - s.Table[261] = syscalls.Supported("futimens", Futimens) + s.Table[261] = syscalls.Supported("futimesat", Futimesat) s.Table[262] = syscalls.Supported("newfstatat", Newfstatat) s.Table[263] = syscalls.Supported("unlinkat", Unlinkat) s.Table[264] = syscalls.Supported("renameat", Renameat) @@ -158,7 +158,7 @@ func Override() { s.Table[306] = syscalls.Supported("syncfs", Syncfs) s.Table[307] = syscalls.Supported("sendmmsg", SendMMsg) s.Table[316] = syscalls.Supported("renameat2", Renameat2) - delete(s.Table, 319) // memfd_create + s.Table[319] = syscalls.Supported("memfd_create", MemfdCreate) s.Table[322] = syscalls.Supported("execveat", Execveat) s.Table[327] = syscalls.Supported("preadv2", Preadv2) s.Table[328] = syscalls.Supported("pwritev2", Pwritev2) diff --git a/pkg/state/BUILD b/pkg/state/BUILD index 921af9d63..2b1350135 100644 --- a/pkg/state/BUILD +++ b/pkg/state/BUILD @@ -47,6 +47,7 @@ go_library( "state.go", "stats.go", ], + marshal = False, stateify = False, visibility = ["//:sandbox"], deps = [ diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD index 0e35d7d17..d0d77e19c 100644 --- a/pkg/sync/BUILD +++ b/pkg/sync/BUILD @@ -39,6 +39,8 @@ go_library( "seqcount.go", "sync.go", ], + marshal = False, + stateify = False, ) go_test( diff --git a/pkg/tcpip/header/tcp.go b/pkg/tcpip/header/tcp.go index 29454c4b9..4c6f808e5 100644 --- a/pkg/tcpip/header/tcp.go +++ b/pkg/tcpip/header/tcp.go @@ -66,6 +66,14 @@ const ( TCPOptionSACK = 5 ) +// Option Lengths. +const ( + TCPOptionMSSLength = 4 + TCPOptionTSLength = 10 + TCPOptionWSLength = 3 + TCPOptionSackPermittedLength = 2 +) + // TCPFields contains the fields of a TCP packet. It is used to describe the // fields of a packet that needs to be encoded. type TCPFields struct { @@ -494,14 +502,11 @@ func ParseTCPOptions(b []byte) TCPOptions { // returns without encoding anything. It returns the number of bytes written to // the provided buffer. func EncodeMSSOption(mss uint32, b []byte) int { - // mssOptionSize is the number of bytes in a valid MSS option. - const mssOptionSize = 4 - - if len(b) < mssOptionSize { + if len(b) < TCPOptionMSSLength { return 0 } - b[0], b[1], b[2], b[3] = TCPOptionMSS, mssOptionSize, byte(mss>>8), byte(mss) - return mssOptionSize + b[0], b[1], b[2], b[3] = TCPOptionMSS, TCPOptionMSSLength, byte(mss>>8), byte(mss) + return TCPOptionMSSLength } // EncodeWSOption encodes the WS TCP option with the WS value in the @@ -509,10 +514,10 @@ func EncodeMSSOption(mss uint32, b []byte) int { // returns without encoding anything. It returns the number of bytes written to // the provided buffer. func EncodeWSOption(ws int, b []byte) int { - if len(b) < 3 { + if len(b) < TCPOptionWSLength { return 0 } - b[0], b[1], b[2] = TCPOptionWS, 3, uint8(ws) + b[0], b[1], b[2] = TCPOptionWS, TCPOptionWSLength, uint8(ws) return int(b[1]) } @@ -521,10 +526,10 @@ func EncodeWSOption(ws int, b []byte) int { // just returns without encoding anything. It returns the number of bytes // written to the provided buffer. func EncodeTSOption(tsVal, tsEcr uint32, b []byte) int { - if len(b) < 10 { + if len(b) < TCPOptionTSLength { return 0 } - b[0], b[1] = TCPOptionTS, 10 + b[0], b[1] = TCPOptionTS, TCPOptionTSLength binary.BigEndian.PutUint32(b[2:], tsVal) binary.BigEndian.PutUint32(b[6:], tsEcr) return int(b[1]) @@ -535,11 +540,11 @@ func EncodeTSOption(tsVal, tsEcr uint32, b []byte) int { // encoding anything. It returns the number of bytes written to the provided // buffer. func EncodeSACKPermittedOption(b []byte) int { - if len(b) < 2 { + if len(b) < TCPOptionSackPermittedLength { return 0 } - b[0], b[1] = TCPOptionSACKPermitted, 2 + b[0], b[1] = TCPOptionSACKPermitted, TCPOptionSackPermittedLength return int(b[1]) } diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index b39ffa9fb..0ab4c3e19 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -235,11 +235,11 @@ type RcvBufAutoTuneParams struct { // was started. MeasureTime time.Time - // CopiedBytes is the number of bytes copied to user space since + // CopiedBytes is the number of bytes copied to userspace since // this measure began. CopiedBytes int - // PrevCopiedBytes is the number of bytes copied to user space in + // PrevCopiedBytes is the number of bytes copied to userspace in // the previous RTT period. PrevCopiedBytes int diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 1ca4088c9..b7b227328 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -110,6 +110,71 @@ var ( ErrAddressFamilyNotSupported = &Error{msg: "address family not supported by protocol"} ) +var messageToError map[string]*Error + +var populate sync.Once + +// StringToError converts an error message to the error. +func StringToError(s string) *Error { + populate.Do(func() { + var errors = []*Error{ + ErrUnknownProtocol, + ErrUnknownNICID, + ErrUnknownDevice, + ErrUnknownProtocolOption, + ErrDuplicateNICID, + ErrDuplicateAddress, + ErrNoRoute, + ErrBadLinkEndpoint, + ErrAlreadyBound, + ErrInvalidEndpointState, + ErrAlreadyConnecting, + ErrAlreadyConnected, + ErrNoPortAvailable, + ErrPortInUse, + ErrBadLocalAddress, + ErrClosedForSend, + ErrClosedForReceive, + ErrWouldBlock, + ErrConnectionRefused, + ErrTimeout, + ErrAborted, + ErrConnectStarted, + ErrDestinationRequired, + ErrNotSupported, + ErrQueueSizeNotSupported, + ErrNotConnected, + ErrConnectionReset, + ErrConnectionAborted, + ErrNoSuchFile, + ErrInvalidOptionValue, + ErrNoLinkAddress, + ErrBadAddress, + ErrNetworkUnreachable, + ErrMessageTooLong, + ErrNoBufferSpace, + ErrBroadcastDisabled, + ErrNotPermitted, + ErrAddressFamilyNotSupported, + } + + messageToError = make(map[string]*Error) + for _, e := range errors { + if messageToError[e.String()] != nil { + panic("tcpip errors with duplicated message: " + e.String()) + } + messageToError[e.String()] = e + } + }) + + e, ok := messageToError[s] + if !ok { + panic("unknown error message: " + s) + } + + return e +} + // Errors related to Subnet var ( errSubnetLengthMismatch = errors.New("subnet length of address and mask differ") @@ -622,6 +687,19 @@ const ( // // A zero value indicates the default. TTLOption + + // TCPSynCountOption is used by SetSockOpt/GetSockOpt to specify the number of + // SYN retransmits that TCP should send before aborting the attempt to + // connect. It cannot exceed 255. + // + // NOTE: This option is currently only stubbed out and is no-op. + TCPSynCountOption + + // TCPWindowClampOption is used by SetSockOpt/GetSockOpt to bound the size + // of the advertised window to this value. + // + // NOTE: This option is currently only stubed out and is a no-op + TCPWindowClampOption ) // ErrorOption is used in GetSockOpt to specify that the last error reported by @@ -685,11 +763,23 @@ type TCPDeferAcceptOption time.Duration // default MinRTO used by the Stack. type TCPMinRTOOption time.Duration +// TCPMaxRTOOption is use by SetSockOpt/GetSockOpt to allow overriding +// default MaxRTO used by the Stack. +type TCPMaxRTOOption time.Duration + +// TCPMaxRetriesOption is used by SetSockOpt/GetSockOpt to set/get the +// maximum number of retransmits after which we time out the connection. +type TCPMaxRetriesOption uint64 + // TCPSynRcvdCountThresholdOption is used by SetSockOpt/GetSockOpt to specify // the number of endpoints that can be in SYN-RCVD state before the stack // switches to using SYN cookies. type TCPSynRcvdCountThresholdOption uint64 +// TCPSynRetriesOption is used by SetSockOpt/GetSockOpt to specify stack-wide +// default for number of times SYN is retransmitted before aborting a connect. +type TCPSynRetriesOption uint8 + // MulticastInterfaceOption is used by SetSockOpt/GetSockOpt to specify a // default interface for multicast. type MulticastInterfaceOption struct { diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 07d3e64c8..b5ba972f1 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -470,6 +470,17 @@ type endpoint struct { // for this endpoint using the TCP_MAXSEG setsockopt. userMSS uint16 + // maxSynRetries is the maximum number of SYN retransmits that TCP should + // send before aborting the attempt to connect. It cannot exceed 255. + // + // NOTE: This is currently a no-op and does not change the SYN + // retransmissions. + maxSynRetries uint8 + + // windowClamp is used to bound the size of the advertised window to + // this value. + windowClamp uint32 + // The following fields are used to manage the send buffer. When // segments are ready to be sent, they are added to sndQueue and the // protocol goroutine is signaled via sndWaker. @@ -795,8 +806,10 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue interval: 75 * time.Second, count: 9, }, - uniqueID: s.UniqueID(), - txHash: s.Rand().Uint32(), + uniqueID: s.UniqueID(), + txHash: s.Rand().Uint32(), + windowClamp: DefaultReceiveBufferSize, + maxSynRetries: DefaultSynRetries, } var ss SendBufferSizeOption @@ -829,6 +842,11 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue e.tcpLingerTimeout = time.Duration(tcpLT) } + var synRetries tcpip.TCPSynRetriesOption + if err := s.TransportProtocolOption(ProtocolNumber, &synRetries); err == nil { + e.maxSynRetries = uint8(synRetries) + } + if p := s.GetTCPProbe(); p != nil { e.probe = p } @@ -1079,7 +1097,7 @@ func (e *endpoint) initialReceiveWindow() int { } // ModerateRecvBuf adjusts the receive buffer and the advertised window -// based on the number of bytes copied to user space. +// based on the number of bytes copied to userspace. func (e *endpoint) ModerateRecvBuf(copied int) { e.LockUser() defer e.UnlockUser() @@ -1603,6 +1621,36 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { e.ttl = uint8(v) e.UnlockUser() + case tcpip.TCPSynCountOption: + if v < 1 || v > 255 { + return tcpip.ErrInvalidOptionValue + } + e.LockUser() + e.maxSynRetries = uint8(v) + e.UnlockUser() + + case tcpip.TCPWindowClampOption: + if v == 0 { + e.LockUser() + switch e.EndpointState() { + case StateClose, StateInitial: + e.windowClamp = 0 + e.UnlockUser() + return nil + default: + e.UnlockUser() + return tcpip.ErrInvalidOptionValue + } + } + var rs ReceiveBufferSizeOption + if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil { + if v < rs.Min/2 { + v = rs.Min / 2 + } + } + e.LockUser() + e.windowClamp = uint32(v) + e.UnlockUser() } return nil } @@ -1826,6 +1874,18 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { e.UnlockUser() return v, nil + case tcpip.TCPSynCountOption: + e.LockUser() + v := int(e.maxSynRetries) + e.UnlockUser() + return v, nil + + case tcpip.TCPWindowClampOption: + e.LockUser() + v := int(e.windowClamp) + e.UnlockUser() + return v, nil + default: return -1, tcpip.ErrUnknownProtocolOption } diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index 8b7562396..fc43c11e2 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -314,7 +314,7 @@ func (e *endpoint) loadLastError(s string) { return } - e.lastError = loadError(s) + e.lastError = tcpip.StringToError(s) } // saveHardError is invoked by stateify. @@ -332,71 +332,7 @@ func (e *EndpointInfo) loadHardError(s string) { return } - e.HardError = loadError(s) -} - -var messageToError map[string]*tcpip.Error - -var populate sync.Once - -func loadError(s string) *tcpip.Error { - populate.Do(func() { - var errors = []*tcpip.Error{ - tcpip.ErrUnknownProtocol, - tcpip.ErrUnknownNICID, - tcpip.ErrUnknownDevice, - tcpip.ErrUnknownProtocolOption, - tcpip.ErrDuplicateNICID, - tcpip.ErrDuplicateAddress, - tcpip.ErrNoRoute, - tcpip.ErrBadLinkEndpoint, - tcpip.ErrAlreadyBound, - tcpip.ErrInvalidEndpointState, - tcpip.ErrAlreadyConnecting, - tcpip.ErrAlreadyConnected, - tcpip.ErrNoPortAvailable, - tcpip.ErrPortInUse, - tcpip.ErrBadLocalAddress, - tcpip.ErrClosedForSend, - tcpip.ErrClosedForReceive, - tcpip.ErrWouldBlock, - tcpip.ErrConnectionRefused, - tcpip.ErrTimeout, - tcpip.ErrAborted, - tcpip.ErrConnectStarted, - tcpip.ErrDestinationRequired, - tcpip.ErrNotSupported, - tcpip.ErrQueueSizeNotSupported, - tcpip.ErrNotConnected, - tcpip.ErrConnectionReset, - tcpip.ErrConnectionAborted, - tcpip.ErrNoSuchFile, - tcpip.ErrInvalidOptionValue, - tcpip.ErrNoLinkAddress, - tcpip.ErrBadAddress, - tcpip.ErrNetworkUnreachable, - tcpip.ErrMessageTooLong, - tcpip.ErrNoBufferSpace, - tcpip.ErrBroadcastDisabled, - tcpip.ErrNotPermitted, - tcpip.ErrAddressFamilyNotSupported, - } - - messageToError = make(map[string]*tcpip.Error) - for _, e := range errors { - if messageToError[e.String()] != nil { - panic("tcpip errors with duplicated message: " + e.String()) - } - messageToError[e.String()] = e - } - }) - - e, ok := messageToError[s] - if !ok { - panic("unknown error message: " + s) - } - - return e + e.HardError = tcpip.StringToError(s) } // saveMeasureTime is invoked by stateify. diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index cfd9a4e8e..2a2a7ddeb 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -64,6 +64,10 @@ const ( // DefaultTCPTimeWaitTimeout is the amount of time that sockets linger // in TIME_WAIT state before being marked closed. DefaultTCPTimeWaitTimeout = 60 * time.Second + + // DefaultSynRetries is the default value for the number of SYN retransmits + // before a connect is aborted. + DefaultSynRetries = 6 ) // SACKEnabled option can be used to enable SACK support in the TCP @@ -163,7 +167,10 @@ type protocol struct { tcpLingerTimeout time.Duration tcpTimeWaitTimeout time.Duration minRTO time.Duration + maxRTO time.Duration + maxRetries uint32 synRcvdCount synRcvdCounter + synRetries uint8 dispatcher *dispatcher } @@ -340,12 +347,36 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error { p.mu.Unlock() return nil + case tcpip.TCPMaxRTOOption: + if v < 0 { + v = tcpip.TCPMaxRTOOption(MaxRTO) + } + p.mu.Lock() + p.maxRTO = time.Duration(v) + p.mu.Unlock() + return nil + + case tcpip.TCPMaxRetriesOption: + p.mu.Lock() + p.maxRetries = uint32(v) + p.mu.Unlock() + return nil + case tcpip.TCPSynRcvdCountThresholdOption: p.mu.Lock() p.synRcvdCount.SetThreshold(uint64(v)) p.mu.Unlock() return nil + case tcpip.TCPSynRetriesOption: + if v < 1 || v > 255 { + return tcpip.ErrInvalidOptionValue + } + p.mu.Lock() + p.synRetries = uint8(v) + p.mu.Unlock() + return nil + default: return tcpip.ErrUnknownProtocolOption } @@ -414,12 +445,30 @@ func (p *protocol) Option(option interface{}) *tcpip.Error { p.mu.RUnlock() return nil + case *tcpip.TCPMaxRTOOption: + p.mu.RLock() + *v = tcpip.TCPMaxRTOOption(p.maxRTO) + p.mu.RUnlock() + return nil + + case *tcpip.TCPMaxRetriesOption: + p.mu.RLock() + *v = tcpip.TCPMaxRetriesOption(p.maxRetries) + p.mu.RUnlock() + return nil + case *tcpip.TCPSynRcvdCountThresholdOption: p.mu.RLock() *v = tcpip.TCPSynRcvdCountThresholdOption(p.synRcvdCount.Threshold()) p.mu.RUnlock() return nil + case *tcpip.TCPSynRetriesOption: + p.mu.RLock() + *v = tcpip.TCPSynRetriesOption(p.synRetries) + p.mu.RUnlock() + return nil + default: return tcpip.ErrUnknownProtocolOption } @@ -452,6 +501,9 @@ func NewProtocol() stack.TransportProtocol { tcpTimeWaitTimeout: DefaultTCPTimeWaitTimeout, synRcvdCount: synRcvdCounter{threshold: SynRcvdCountThreshold}, dispatcher: newDispatcher(runtime.GOMAXPROCS(0)), + synRetries: DefaultSynRetries, minRTO: MinRTO, + maxRTO: MaxRTO, + maxRetries: MaxRetries, } } diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 9e547a221..06dc9b7d7 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -43,7 +43,8 @@ const ( nDupAckThreshold = 3 // MaxRetries is the maximum number of probe retries sender does - // before timing out the connection, Linux default TCP_RETR2. + // before timing out the connection. + // Linux default TCP_RETR2, net.ipv4.tcp_retries2. MaxRetries = 15 ) @@ -165,6 +166,12 @@ type sender struct { // minRTO is the minimum permitted value for sender.rto. minRTO time.Duration + // maxRTO is the maximum permitted value for sender.rto. + maxRTO time.Duration + + // maxRetries is the maximum permitted retransmissions. + maxRetries uint32 + // maxPayloadSize is the maximum size of the payload of a given segment. // It is initialized on demand. maxPayloadSize int @@ -276,12 +283,24 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint // etc. s.ep.scoreboard = NewSACKScoreboard(uint16(s.maxPayloadSize), iss) - // Get Stack wide minRTO. - var v tcpip.TCPMinRTOOption - if err := ep.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil { + // Get Stack wide config. + var minRTO tcpip.TCPMinRTOOption + if err := ep.stack.TransportProtocolOption(ProtocolNumber, &minRTO); err != nil { panic(fmt.Sprintf("unable to get minRTO from stack: %s", err)) } - s.minRTO = time.Duration(v) + s.minRTO = time.Duration(minRTO) + + var maxRTO tcpip.TCPMaxRTOOption + if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRTO); err != nil { + panic(fmt.Sprintf("unable to get maxRTO from stack: %s", err)) + } + s.maxRTO = time.Duration(maxRTO) + + var maxRetries tcpip.TCPMaxRetriesOption + if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRetries); err != nil { + panic(fmt.Sprintf("unable to get maxRetries from stack: %s", err)) + } + s.maxRetries = uint32(maxRetries) return s } @@ -485,7 +504,7 @@ func (s *sender) retransmitTimerExpired() bool { } elapsed := time.Since(s.firstRetransmittedSegXmitTime) - remaining := MaxRTO + remaining := s.maxRTO if uto != 0 { // Cap to the user specified timeout if one is specified. remaining = uto - elapsed @@ -494,24 +513,17 @@ func (s *sender) retransmitTimerExpired() bool { // Always honor the user-timeout irrespective of whether the zero // window probes were acknowledged. // net/ipv4/tcp_timer.c::tcp_probe_timer() - if remaining <= 0 || s.unackZeroWindowProbes >= MaxRetries { + if remaining <= 0 || s.unackZeroWindowProbes >= s.maxRetries { return false } - if s.rto >= MaxRTO { - // RFC 1122 section: 4.2.2.17 - // A TCP MAY keep its offered receive window closed - // indefinitely. As long as the receiving TCP continues to - // send acknowledgments in response to the probe segments, the - // sending TCP MUST allow the connection to stay open. - if !(s.zeroWindowProbing && s.unackZeroWindowProbes == 0) { - return false - } - } - // Set new timeout. The timer will be restarted by the call to sendData // below. s.rto *= 2 + // Cap the RTO as per RFC 1122 4.2.3.1, RFC 6298 5.5 + if s.rto > s.maxRTO { + s.rto = s.maxRTO + } // Cap RTO to remaining time. if s.rto > remaining { @@ -565,9 +577,20 @@ func (s *sender) retransmitTimerExpired() bool { // send. if s.zeroWindowProbing { s.sendZeroWindowProbe() + // RFC 1122 4.2.2.17: A TCP MAY keep its offered receive window closed + // indefinitely. As long as the receiving TCP continues to send + // acknowledgments in response to the probe segments, the sending TCP + // MUST allow the connection to stay open. return true } + seg := s.writeNext + // RFC 1122 4.2.3.5: Close the connection when the number of + // retransmissions for this segment is beyond a limit. + if seg != nil && seg.xmitCount > s.maxRetries { + return false + } + s.sendData() return true diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index d2c90ebd5..6ef32a1b3 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -2994,6 +2994,101 @@ func TestSendOnResetConnection(t *testing.T) { } } +// TestMaxRetransmitsTimeout tests if the connection is timed out after +// a segment has been retransmitted MaxRetries times. +func TestMaxRetransmitsTimeout(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + const numRetries = 2 + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPMaxRetriesOption(numRetries)); err != nil { + t.Fatalf("could not set protocol option MaxRetries.\n") + } + + c.CreateConnected(789 /* iss */, 30000 /* rcvWnd */, -1 /* epRcvBuf */) + + waitEntry, notifyCh := waiter.NewChannelEntry(nil) + c.WQ.EventRegister(&waitEntry, waiter.EventHUp) + defer c.WQ.EventUnregister(&waitEntry) + + _, _, err := c.EP.Write(tcpip.SlicePayload(buffer.NewView(1)), tcpip.WriteOptions{}) + if err != nil { + t.Fatalf("Write failed: %v", err) + } + + // Expect first transmit and MaxRetries retransmits. + for i := 0; i < numRetries+1; i++ { + checker.IPv4(t, c.GetPacket(), + checker.TCP( + checker.DstPort(context.TestPort), + checker.TCPFlags(header.TCPFlagAck|header.TCPFlagPsh), + ), + ) + } + // Wait for the connection to timeout after MaxRetries retransmits. + initRTO := 1 * time.Second + select { + case <-notifyCh: + case <-time.After((2 << numRetries) * initRTO): + t.Fatalf("connection still alive after maximum retransmits.\n") + } + + // Send an ACK and expect a RST as the connection would have been closed. + c.SendPacket(nil, &context.Headers{ + SrcPort: context.TestPort, + DstPort: c.Port, + Flags: header.TCPFlagAck, + }) + + checker.IPv4(t, c.GetPacket(), + checker.TCP( + checker.DstPort(context.TestPort), + checker.TCPFlags(header.TCPFlagRst), + ), + ) + + if got := c.Stack().Stats().TCP.EstablishedTimedout.Value(); got != 1 { + t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout.Value() = %v, want = 1", got) + } +} + +// TestMaxRTO tests if the retransmit interval caps to MaxRTO. +func TestMaxRTO(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + rto := 1 * time.Second + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPMaxRTOOption(rto)); err != nil { + t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPMaxRTO(%d) failed: %s", rto, err) + } + + c.CreateConnected(789 /* iss */, 30000 /* rcvWnd */, -1 /* epRcvBuf */) + + _, _, err := c.EP.Write(tcpip.SlicePayload(buffer.NewView(1)), tcpip.WriteOptions{}) + if err != nil { + t.Fatalf("Write failed: %v", err) + } + checker.IPv4(t, c.GetPacket(), + checker.TCP( + checker.DstPort(context.TestPort), + checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), + ), + ) + const numRetransmits = 2 + for i := 0; i < numRetransmits; i++ { + start := time.Now() + checker.IPv4(t, c.GetPacket(), + checker.TCP( + checker.DstPort(context.TestPort), + checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), + ), + ) + if time.Since(start).Round(time.Second).Seconds() != rto.Seconds() { + t.Errorf("Retransmit interval not capped to MaxRTO.\n") + } + } +} + func TestFinImmediately(t *testing.T) { c := context.New(t, defaultMTU) defer c.Cleanup() @@ -5774,7 +5869,7 @@ func TestReceiveBufferAutoTuning(t *testing.T) { // Invoke the moderation API. This is required for auto-tuning // to happen. This method is normally expected to be invoked // from a higher layer than tcpip.Endpoint. So we simulate - // copying to user-space by invoking it explicitly here. + // copying to userspace by invoking it explicitly here. c.EP.ModerateRecvBuf(totalCopied) // Now send a keep-alive packet to trigger an ACK so that we can @@ -6605,9 +6700,16 @@ func TestTCPUserTimeout(t *testing.T) { c.CreateConnected(789, 30000, -1 /* epRcvBuf */) + waitEntry, notifyCh := waiter.NewChannelEntry(nil) + c.WQ.EventRegister(&waitEntry, waiter.EventHUp) + defer c.WQ.EventUnregister(&waitEntry) + origEstablishedTimedout := c.Stack().Stats().TCP.EstablishedTimedout.Value() - userTimeout := 50 * time.Millisecond + // Ensure that on the next retransmit timer fire, the user timeout has + // expired. + initRTO := 1 * time.Second + userTimeout := initRTO / 2 c.EP.SetSockOpt(tcpip.TCPUserTimeoutOption(userTimeout)) // Send some data and wait before ACKing it. @@ -6627,9 +6729,13 @@ func TestTCPUserTimeout(t *testing.T) { ), ) - // Wait for a little over the minimum retransmit timeout of 200ms for - // the retransmitTimer to fire and close the connection. - time.Sleep(tcp.MinRTO + 10*time.Millisecond) + // Wait for the retransmit timer to be fired and the user timeout to cause + // close of the connection. + select { + case <-notifyCh: + case <-time.After(2 * initRTO): + t.Fatalf("connection still alive after %s, should have been closed after :%s", 2*initRTO, userTimeout) + } // No packet should be received as the connection should be silently // closed due to timeout. diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 756ab913a..647b2067a 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -106,6 +106,9 @@ type endpoint struct { bindToDevice tcpip.NICID broadcast bool + lastErrorMu sync.Mutex `state:"nosave"` + lastError *tcpip.Error `state:".(string)"` + // Values used to reserve a port or register a transport endpoint. // (which ever happens first). boundBindToDevice tcpip.NICID @@ -188,6 +191,15 @@ func (e *endpoint) UniqueID() uint64 { return e.uniqueID } +func (e *endpoint) takeLastError() *tcpip.Error { + e.lastErrorMu.Lock() + defer e.lastErrorMu.Unlock() + + err := e.lastError + e.lastError = nil + return err +} + // Abort implements stack.TransportEndpoint.Abort. func (e *endpoint) Abort() { e.Close() @@ -243,6 +255,10 @@ func (e *endpoint) IPTables() (stack.IPTables, error) { // Read reads data from the endpoint. This method does not block if // there is no data pending. func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { + if err := e.takeLastError(); err != nil { + return buffer.View{}, tcpip.ControlMessages{}, err + } + e.rcvMu.Lock() if e.rcvList.Empty() { @@ -382,6 +398,10 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c } func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) { + if err := e.takeLastError(); err != nil { + return 0, nil, err + } + // MSG_MORE is unimplemented. (This also means that MSG_EOR is a no-op.) if opts.More { return 0, nil, tcpip.ErrInvalidOptionValue @@ -853,6 +873,7 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { switch o := opt.(type) { case tcpip.ErrorOption: + return e.takeLastError() case *tcpip.MulticastInterfaceOption: e.mu.Lock() *o = tcpip.MulticastInterfaceOption{ @@ -1316,6 +1337,17 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket. func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) { + if typ == stack.ControlPortUnreachable { + e.mu.RLock() + defer e.mu.RUnlock() + + if e.state == StateConnected { + e.lastErrorMu.Lock() + defer e.lastErrorMu.Unlock() + + e.lastError = tcpip.ErrConnectionRefused + } + } } // State implements tcpip.Endpoint.State. diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go index 466bd9381..851e6b635 100644 --- a/pkg/tcpip/transport/udp/endpoint_state.go +++ b/pkg/tcpip/transport/udp/endpoint_state.go @@ -37,6 +37,24 @@ func (u *udpPacket) loadData(data buffer.VectorisedView) { u.data = data } +// saveLastError is invoked by stateify. +func (e *endpoint) saveLastError() string { + if e.lastError == nil { + return "" + } + + return e.lastError.String() +} + +// loadLastError is invoked by stateify. +func (e *endpoint) loadLastError(s string) { + if s == "" { + return + } + + e.lastError = tcpip.StringToError(s) +} + // beforeSave is invoked by stateify. func (e *endpoint) beforeSave() { // Stop incoming packets from being handled (and mutate endpoint state). diff --git a/pkg/test/dockerutil/dockerutil.go b/pkg/test/dockerutil/dockerutil.go index 5f2af9f3b..c45d2ecbc 100644 --- a/pkg/test/dockerutil/dockerutil.go +++ b/pkg/test/dockerutil/dockerutil.go @@ -148,6 +148,62 @@ func (m MountMode) String() string { panic(fmt.Sprintf("invalid mode: %d", m)) } +// DockerNetwork contains the name of a docker network. +type DockerNetwork struct { + logger testutil.Logger + Name string + Subnet *net.IPNet + containers []*Docker +} + +// NewDockerNetwork sets up the struct for a Docker network. Names of networks +// will be unique. +func NewDockerNetwork(logger testutil.Logger) *DockerNetwork { + return &DockerNetwork{ + logger: logger, + Name: testutil.RandomID(logger.Name()), + } +} + +// Create calls 'docker network create'. +func (n *DockerNetwork) Create(args ...string) error { + a := []string{"docker", "network", "create"} + if n.Subnet != nil { + a = append(a, fmt.Sprintf("--subnet=%s", n.Subnet)) + } + a = append(a, args...) + a = append(a, n.Name) + return testutil.Command(n.logger, a...).Run() +} + +// Connect calls 'docker network connect' with the arguments provided. +func (n *DockerNetwork) Connect(container *Docker, args ...string) error { + a := []string{"docker", "network", "connect"} + a = append(a, args...) + a = append(a, n.Name, container.Name) + if err := testutil.Command(n.logger, a...).Run(); err != nil { + return err + } + n.containers = append(n.containers, container) + return nil +} + +// Cleanup cleans up the docker network and all the containers attached to it. +func (n *DockerNetwork) Cleanup() error { + for _, c := range n.containers { + // Don't propagate the error, it might be that the container + // was already cleaned up. + if err := c.Kill(); err != nil { + n.logger.Logf("unable to kill container during cleanup: %s", err) + } + } + + if err := testutil.Command(n.logger, "docker", "network", "rm", n.Name).Run(); err != nil { + return err + } + return nil +} + // Docker contains the name and the runtime of a docker container. type Docker struct { logger testutil.Logger @@ -162,9 +218,13 @@ type Docker struct { // // Names of containers will be unique. func MakeDocker(logger testutil.Logger) *Docker { + // Slashes are not allowed in container names. + name := testutil.RandomID(logger.Name()) + name = strings.ReplaceAll(name, "/", "-") + return &Docker{ logger: logger, - Name: testutil.RandomID(logger.Name()), + Name: name, Runtime: *runtime, } } @@ -309,7 +369,9 @@ func (d *Docker) argsFor(r *RunOpts, command string, p []string) (rv []string) { rv = append(rv, d.Name) } else { rv = append(rv, d.mounts...) - rv = append(rv, fmt.Sprintf("--runtime=%s", d.Runtime)) + if len(d.Runtime) > 0 { + rv = append(rv, fmt.Sprintf("--runtime=%s", d.Runtime)) + } rv = append(rv, fmt.Sprintf("--name=%s", d.Name)) rv = append(rv, testutil.ImageByName(r.Image)) } @@ -477,6 +539,56 @@ func (d *Docker) FindIP() (net.IP, error) { return ip, nil } +// A NetworkInterface is container's network interface information. +type NetworkInterface struct { + IPv4 net.IP + MAC net.HardwareAddr +} + +// ListNetworks returns the network interfaces of the container, keyed by +// Docker network name. +func (d *Docker) ListNetworks() (map[string]NetworkInterface, error) { + const format = `{{json .NetworkSettings.Networks}}` + out, err := testutil.Command(d.logger, "docker", "inspect", "-f", format, d.Name).CombinedOutput() + if err != nil { + return nil, fmt.Errorf("error network interfaces: %q: %w", string(out), err) + } + + networks := map[string]map[string]string{} + if err := json.Unmarshal(out, &networks); err != nil { + return nil, fmt.Errorf("error decoding network interfaces: %w", err) + } + + interfaces := map[string]NetworkInterface{} + for name, iface := range networks { + var netface NetworkInterface + + rawIP := strings.TrimSpace(iface["IPAddress"]) + if rawIP != "" { + ip := net.ParseIP(rawIP) + if ip == nil { + return nil, fmt.Errorf("invalid IP: %q", rawIP) + } + // Docker's IPAddress field is IPv4. The IPv6 address + // is stored in the GlobalIPv6Address field. + netface.IPv4 = ip + } + + rawMAC := strings.TrimSpace(iface["MacAddress"]) + if rawMAC != "" { + mac, err := net.ParseMAC(rawMAC) + if err != nil { + return nil, fmt.Errorf("invalid MAC: %q: %w", rawMAC, err) + } + netface.MAC = mac + } + + interfaces[name] = netface + } + + return interfaces, nil +} + // SandboxPid returns the PID to the sandbox process. func (d *Docker) SandboxPid() (int, error) { out, err := testutil.Command(d.logger, "docker", "inspect", "-f={{.State.Pid}}", d.Name).CombinedOutput() diff --git a/pkg/usermem/addr.go b/pkg/usermem/addr.go index e79210804..c4100481e 100644 --- a/pkg/usermem/addr.go +++ b/pkg/usermem/addr.go @@ -106,3 +106,20 @@ func (ar AddrRange) IsPageAligned() bool { func (ar AddrRange) String() string { return fmt.Sprintf("[%#x, %#x)", ar.Start, ar.End) } + +// PageRoundDown/Up are equivalent to Addr.RoundDown/Up, but without the +// potentially truncating conversion from uint64 to Addr. This is necessary +// because there is no way to define generic "PageRoundDown/Up" functions in Go. + +// PageRoundDown returns x rounded down to the nearest page boundary. +func PageRoundDown(x uint64) uint64 { + return x &^ (PageSize - 1) +} + +// PageRoundUp returns x rounded up to the nearest page boundary. +// ok is true iff rounding up did not wrap around. +func PageRoundUp(x uint64) (addr uint64, ok bool) { + addr = PageRoundDown(x + PageSize - 1) + ok = addr >= x + return +} diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index b7cfb35bf..84c67cbc2 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -119,7 +119,13 @@ func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) { } if tr.shouldReport(regs) { - c.sink.Infof("Unsupported syscall: %s, regs: %+v", c.nameMap.Name(uintptr(sysnr)), regs) + name := c.nameMap.Name(uintptr(sysnr)) + c.sink.Infof("Unsupported syscall %s(%#x,%#x,%#x,%#x,%#x,%#x). It is "+ + "likely that you can safely ignore this message and that this is not "+ + "the cause of any error. Please, refer to %s/%s for more information.", + name, argVal(0, regs), argVal(1, regs), argVal(2, regs), argVal(3, regs), + argVal(4, regs), argVal(5, regs), syscallLink, name) + tr.onReported(regs) } } diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go index 42b0ca8b0..8eb76b2ba 100644 --- a/runsc/boot/compat_amd64.go +++ b/runsc/boot/compat_amd64.go @@ -24,8 +24,12 @@ import ( "gvisor.dev/gvisor/pkg/sentry/strace" ) -// reportLimit is the max number of events that should be reported per tracker. -const reportLimit = 100 +const ( + // reportLimit is the max number of events that should be reported per + // tracker. + reportLimit = 100 + syscallLink = "https://gvisor.dev/c/linux/amd64" +) // newRegs create a empty Registers instance. func newRegs() *rpb.Registers { @@ -36,22 +40,22 @@ func newRegs() *rpb.Registers { } } -func argVal(argIdx int, regs *rpb.Registers) uint32 { +func argVal(argIdx int, regs *rpb.Registers) uint64 { amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64 switch argIdx { case 0: - return uint32(amd64Regs.Rdi) + return amd64Regs.Rdi case 1: - return uint32(amd64Regs.Rsi) + return amd64Regs.Rsi case 2: - return uint32(amd64Regs.Rdx) + return amd64Regs.Rdx case 3: - return uint32(amd64Regs.R10) + return amd64Regs.R10 case 4: - return uint32(amd64Regs.R8) + return amd64Regs.R8 case 5: - return uint32(amd64Regs.R9) + return amd64Regs.R9 } panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) } diff --git a/runsc/boot/compat_arm64.go b/runsc/boot/compat_arm64.go index f784cd237..bce9d95b3 100644 --- a/runsc/boot/compat_arm64.go +++ b/runsc/boot/compat_arm64.go @@ -23,8 +23,12 @@ import ( "gvisor.dev/gvisor/pkg/sentry/strace" ) -// reportLimit is the max number of events that should be reported per tracker. -const reportLimit = 100 +const ( + // reportLimit is the max number of events that should be reported per + // tracker. + reportLimit = 100 + syscallLink = "https://gvisor.dev/c/linux/arm64" +) // newRegs create a empty Registers instance. func newRegs() *rpb.Registers { @@ -35,22 +39,22 @@ func newRegs() *rpb.Registers { } } -func argVal(argIdx int, regs *rpb.Registers) uint32 { +func argVal(argIdx int, regs *rpb.Registers) uint64 { arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64 switch argIdx { case 0: - return uint32(arm64Regs.R0) + return arm64Regs.R0 case 1: - return uint32(arm64Regs.R1) + return arm64Regs.R1 case 2: - return uint32(arm64Regs.R2) + return arm64Regs.R2 case 3: - return uint32(arm64Regs.R3) + return arm64Regs.R3 case 4: - return uint32(arm64Regs.R4) + return arm64Regs.R4 case 5: - return uint32(arm64Regs.R5) + return arm64Regs.R5 } panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 8df5cc989..e1181271a 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -770,14 +770,8 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) ( useOverlay bool ) - for _, opt := range m.Options { - // When options include either "bind" or "rbind", this behaves as - // bind mount even if the mount type is equal to a filesystem supported - // on runsc. - if opt == "bind" || opt == "rbind" { - m.Type = bind - break - } + if isBindMount(m) { + m.Type = bind } switch m.Type { @@ -807,6 +801,18 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) ( return fsName, opts, useOverlay, nil } +func isBindMount(m specs.Mount) bool { + for _, opt := range m.Options { + // When options include either "bind" or "rbind", this behaves as + // bind mount even if the mount type is equal to a filesystem supported + // on runsc. + if opt == "bind" || opt == "rbind" { + return true + } + } + return false +} + func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType { if hint := c.hints.findMount(mount); hint != nil { return hint.fileAccessType() diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 7378fbc95..147c901c4 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -203,28 +203,61 @@ func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *C } func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error { - c.prepareMountsVFS2() + mounts, err := c.prepareMountsVFS2() + if err != nil { + return err + } - for _, submount := range c.mounts { + for i := range mounts { + submount := &mounts[i] log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options) - if err := c.mountSubmountVFS2(ctx, conf, mns, creds, &submount); err != nil { + if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil { return err } } // TODO(gvisor.dev/issue/1487): implement mountTmp from fs.go. - return c.checkDispenser() + return nil +} + +type mountAndFD struct { + specs.Mount + fd int } -func (c *containerMounter) prepareMountsVFS2() { +func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) { + // Associate bind mounts with their FDs before sorting since there is an + // undocumented assumption that FDs are dispensed in the order in which + // they are required by mounts. + var mounts []mountAndFD + for _, m := range c.mounts { + fd := -1 + // Only bind mounts use host FDs; see + // containerMounter.getMountNameAndOptionsVFS2. + if m.Type == bind || isBindMount(m) { + fd = c.fds.remove() + } + mounts = append(mounts, mountAndFD{ + Mount: m, + fd: fd, + }) + } + if err := c.checkDispenser(); err != nil { + return nil, err + } + // Sort the mounts so that we don't place children before parents. - sort.Slice(c.mounts, func(i, j int) bool { return len(c.mounts[i].Destination) < len(c.mounts[j].Destination) }) + sort.Slice(mounts, func(i, j int) bool { + return len(mounts[i].Destination) < len(mounts[j].Destination) + }) + + return mounts, nil } // TODO(gvisor.dev/issue/1487): Implement submount options similar to the VFS1 -// version. -func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *specs.Mount) error { +// version. +func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error { root := mns.Root() defer root.DecRef() target := &vfs.PathOperation{ @@ -233,7 +266,7 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, Path: fspath.Parse(submount.Destination), } - fsName, options, useOverlay, err := c.getMountNameAndOptions(conf, *submount) + fsName, options, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, submount) if err != nil { return fmt.Errorf("mountOptions failed: %w", err) } @@ -263,6 +296,45 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, return nil } +// getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values +// used for mounts. +func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, []string, bool, error) { + var ( + fsName string + opts []string + useOverlay bool + ) + + if isBindMount(m.Mount) { + m.Type = bind + } + + switch m.Type { + case devpts.Name, devtmpfs.Name, proc.Name, sys.Name: + fsName = m.Type + case nonefs: + fsName = sys.Name + case tmpfs.Name: + fsName = m.Type + + var err error + opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...) + if err != nil { + return "", nil, false, err + } + + case bind: + fsName = gofer.Name + opts = p9MountOptions(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */) + // If configured, add overlay to all writable mounts. + useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly + + default: + log.Warningf("ignoring unknown filesystem type %q", m.Type) + } + return fsName, opts, useOverlay, nil +} + func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error { target := &vfs.PathOperation{ Root: root, diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index fa40ee509..19c8b0db6 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -19,6 +19,7 @@ package cgroup import ( "bufio" "context" + "errors" "fmt" "io/ioutil" "os" @@ -38,21 +39,23 @@ const ( cgroupRoot = "/sys/fs/cgroup" ) -var controllers = map[string]controller{ - "blkio": &blockIO{}, - "cpu": &cpu{}, - "cpuset": &cpuSet{}, - "memory": &memory{}, - "net_cls": &networkClass{}, - "net_prio": &networkPrio{}, - "pids": &pids{}, +var controllers = map[string]config{ + "blkio": config{ctrlr: &blockIO{}}, + "cpu": config{ctrlr: &cpu{}}, + "cpuset": config{ctrlr: &cpuSet{}}, + "memory": config{ctrlr: &memory{}}, + "net_cls": config{ctrlr: &networkClass{}}, + "net_prio": config{ctrlr: &networkPrio{}}, + "pids": config{ctrlr: &pids{}}, // These controllers either don't have anything in the OCI spec or is // irrelevant for a sandbox. - "devices": &noop{}, - "freezer": &noop{}, - "perf_event": &noop{}, - "systemd": &noop{}, + "devices": config{ctrlr: &noop{}}, + "freezer": config{ctrlr: &noop{}}, + "hugetlb": config{ctrlr: &noop{}, optional: true}, + "perf_event": config{ctrlr: &noop{}}, + "rdma": config{ctrlr: &noop{}, optional: true}, + "systemd": config{ctrlr: &noop{}}, } func setOptionalValueInt(path, name string, val *int64) error { @@ -196,8 +199,9 @@ func LoadPaths(pid string) (map[string]string, error) { return paths, nil } -// Cgroup represents a group inside all controllers. For example: Name='/foo/bar' -// maps to /sys/fs/cgroup/<controller>/foo/bar on all controllers. +// Cgroup represents a group inside all controllers. For example: +// Name='/foo/bar' maps to /sys/fs/cgroup/<controller>/foo/bar on +// all controllers. type Cgroup struct { Name string `json:"name"` Parents map[string]string `json:"parents"` @@ -245,13 +249,17 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error { clean := specutils.MakeCleanup(func() { _ = c.Uninstall() }) defer clean.Clean() - for key, ctrl := range controllers { + for key, cfg := range controllers { path := c.makePath(key) if err := os.MkdirAll(path, 0755); err != nil { + if cfg.optional && errors.Is(err, syscall.EROFS) { + log.Infof("Skipping cgroup %q", key) + continue + } return err } if res != nil { - if err := ctrl.set(res, path); err != nil { + if err := cfg.ctrlr.set(res, path); err != nil { return err } } @@ -321,10 +329,13 @@ func (c *Cgroup) Join() (func(), error) { } // Now join the cgroups. - for key := range controllers { + for key, cfg := range controllers { path := c.makePath(key) log.Debugf("Joining cgroup %q", path) if err := setValue(path, "cgroup.procs", "0"); err != nil { + if cfg.optional && os.IsNotExist(err) { + continue + } return undo, err } } @@ -375,6 +386,11 @@ func (c *Cgroup) makePath(controllerName string) string { return filepath.Join(cgroupRoot, controllerName, path) } +type config struct { + ctrlr controller + optional bool +} + type controller interface { set(*specs.LinuxResources, string) error } diff --git a/runsc/cmd/help.go b/runsc/cmd/help.go index c7d210140..cd85dabbb 100644 --- a/runsc/cmd/help.go +++ b/runsc/cmd/help.go @@ -65,16 +65,10 @@ func (h *Help) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{} switch f.NArg() { case 0: fmt.Fprintf(h.cdr.Output, "Usage: %s <flags> <subcommand> <subcommand args>\n\n", h.cdr.Name()) - fmt.Fprintf(h.cdr.Output, `runsc is a command line client for running applications packaged in the Open -Container Initiative (OCI) format. Applications run by runsc are run in an -isolated gVisor sandbox that emulates a Linux environment. + fmt.Fprintf(h.cdr.Output, `runsc is the gVisor container runtime. -gVisor is a user-space kernel, written in Go, that implements a substantial -portion of the Linux system call interface. It provides an additional layer -of isolation between running applications and the host operating system. - -Functionality is provided by subcommands. For additonal help on individual -subcommands use "%s %s <subcommand>". +Functionality is provided by subcommands. For help with a specific subcommand, +use "%s %s <subcommand>". `, h.cdr.Name(), h.Name()) h.cdr.VisitGroups(func(g *subcommands.CommandGroup) { diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 7ba301331..1a6d50d0d 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1760,7 +1760,7 @@ func TestUserLog(t *testing.T) { if err != nil { t.Fatalf("error opening user log file %q: %v", userLog, err) } - if want := "Unsupported syscall: sched_rr_get_interval"; !strings.Contains(string(out), want) { + if want := "Unsupported syscall sched_rr_get_interval("; !strings.Contains(string(out), want) { t.Errorf("user log file doesn't contain %q, out: %s", want, string(out)) } } diff --git a/test/e2e/integration_test.go b/test/e2e/integration_test.go index ff856883a..9cbb2ed5b 100644 --- a/test/e2e/integration_test.go +++ b/test/e2e/integration_test.go @@ -337,27 +337,53 @@ func TestJobControl(t *testing.T) { } } -// TestTmpFile checks that files inside '/tmp' are not overridden. In addition, -// it checks that working dir is created if it doesn't exit. +// TestWorkingDirCreation checks that working dir is created if it doesn't exit. +func TestWorkingDirCreation(t *testing.T) { + for _, tc := range []struct { + name string + workingDir string + }{ + {name: "root", workingDir: "/foo"}, + {name: "tmp", workingDir: "/tmp/foo"}, + } { + for _, readonly := range []bool{true, false} { + name := tc.name + if readonly { + name += "-readonly" + } + t.Run(name, func(t *testing.T) { + d := dockerutil.MakeDocker(t) + defer d.CleanUp() + + opts := dockerutil.RunOpts{ + Image: "basic/alpine", + WorkDir: tc.workingDir, + ReadOnly: readonly, + } + got, err := d.Run(opts, "sh", "-c", "echo ${PWD}") + if err != nil { + t.Fatalf("docker run failed: %v", err) + } + if want := tc.workingDir + "\n"; want != got { + t.Errorf("invalid working dir, want: %q, got: %q", want, got) + } + }) + } + } +} + +// TestTmpFile checks that files inside '/tmp' are not overridden. func TestTmpFile(t *testing.T) { d := dockerutil.MakeDocker(t) defer d.CleanUp() - // Should work without ReadOnly - if _, err := d.Run(dockerutil.RunOpts{ - Image: "basic/alpine", - WorkDir: "/tmp/foo/bar", - }, "touch", "/tmp/foo/bar/file"); err != nil { + opts := dockerutil.RunOpts{Image: "tmpfile"} + got, err := d.Run(opts, "cat", "/tmp/foo/file.txt") + if err != nil { t.Fatalf("docker run failed: %v", err) } - - // Expect failure. - if _, err := d.Run(dockerutil.RunOpts{ - Image: "basic/alpine", - WorkDir: "/tmp/foo/bar", - ReadOnly: true, - }, "touch", "/tmp/foo/bar/file"); err == nil { - t.Fatalf("docker run expected failure, but succeeded") + if want := "123\n"; want != got { + t.Errorf("invalid file content, want: %q, got: %q", want, got) } } diff --git a/test/packetimpact/README.md b/test/packetimpact/README.md index a82ad996a..f46c67a0c 100644 --- a/test/packetimpact/README.md +++ b/test/packetimpact/README.md @@ -18,6 +18,27 @@ Packetimpact aims to provide: * **Control-flow** like for loops, conditionals, and variables. * **Flexibilty** to specify every byte in a packet or use multiple sockets. +## How to run packetimpact tests? + +Build the test container image by running the following at the root of the +repository: + +```bash +$ make load-packetimpact +``` + +Run a test, e.g. `fin_wait2_timeout`, against Linux: + +```bash +$ bazel test //test/packetimpact/tests:fin_wait2_timeout_linux_test +``` + +Run the same test, but against gVisor: + +```bash +$ bazel test //test/packetimpact/tests:fin_wait2_timeout_netstack_test +``` + ## When to use packetimpact? There are a few ways to write networking tests for gVisor currently: diff --git a/test/packetimpact/netdevs/BUILD b/test/packetimpact/netdevs/BUILD new file mode 100644 index 000000000..422bb9b0c --- /dev/null +++ b/test/packetimpact/netdevs/BUILD @@ -0,0 +1,15 @@ +load("//tools:defs.bzl", "go_library") + +package( + licenses = ["notice"], +) + +go_library( + name = "netdevs", + srcs = ["netdevs.go"], + visibility = ["//test/packetimpact:__subpackages__"], + deps = [ + "//pkg/tcpip", + "//pkg/tcpip/header", + ], +) diff --git a/test/packetimpact/netdevs/netdevs.go b/test/packetimpact/netdevs/netdevs.go new file mode 100644 index 000000000..d2c9cfeaf --- /dev/null +++ b/test/packetimpact/netdevs/netdevs.go @@ -0,0 +1,104 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package netdevs contains utilities for working with network devices. +package netdevs + +import ( + "fmt" + "net" + "regexp" + "strings" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" +) + +// A DeviceInfo represents a network device. +type DeviceInfo struct { + MAC net.HardwareAddr + IPv4Addr net.IP + IPv4Net *net.IPNet + IPv6Addr net.IP + IPv6Net *net.IPNet +} + +var ( + deviceLine = regexp.MustCompile(`^\s*\d+: (\w+)`) + linkLine = regexp.MustCompile(`^\s*link/\w+ ([0-9a-fA-F:]+)`) + inetLine = regexp.MustCompile(`^\s*inet ([0-9./]+)`) + inet6Line = regexp.MustCompile(`^\s*inet6 ([0-9a-fA-Z:/]+)`) +) + +// ParseDevices parses the output from `ip addr show` into a map from device +// name to information about the device. +func ParseDevices(cmdOutput string) (map[string]DeviceInfo, error) { + var currentDevice string + var currentInfo DeviceInfo + deviceInfos := make(map[string]DeviceInfo) + for _, line := range strings.Split(cmdOutput, "\n") { + if m := deviceLine.FindStringSubmatch(line); m != nil { + if currentDevice != "" { + deviceInfos[currentDevice] = currentInfo + } + currentInfo = DeviceInfo{} + currentDevice = m[1] + } else if m := linkLine.FindStringSubmatch(line); m != nil { + mac, err := net.ParseMAC(m[1]) + if err != nil { + return nil, err + } + currentInfo.MAC = mac + } else if m := inetLine.FindStringSubmatch(line); m != nil { + ipv4Addr, ipv4Net, err := net.ParseCIDR(m[1]) + if err != nil { + return nil, err + } + currentInfo.IPv4Addr = ipv4Addr + currentInfo.IPv4Net = ipv4Net + } else if m := inet6Line.FindStringSubmatch(line); m != nil { + ipv6Addr, ipv6Net, err := net.ParseCIDR(m[1]) + if err != nil { + return nil, err + } + currentInfo.IPv6Addr = ipv6Addr + currentInfo.IPv6Net = ipv6Net + } + } + if currentDevice != "" { + deviceInfos[currentDevice] = currentInfo + } + return deviceInfos, nil +} + +// MACToIP converts the MAC address to an IPv6 link local address as described +// in RFC 4291 page 20: https://tools.ietf.org/html/rfc4291#page-20 +func MACToIP(mac net.HardwareAddr) net.IP { + addr := make([]byte, header.IPv6AddressSize) + addr[0] = 0xfe + addr[1] = 0x80 + header.EthernetAdddressToModifiedEUI64IntoBuf(tcpip.LinkAddress(mac), addr[8:]) + return net.IP(addr) +} + +// FindDeviceByIP finds a DeviceInfo and device name from an IP address in the +// output of ParseDevices. +func FindDeviceByIP(ip net.IP, devices map[string]DeviceInfo) (string, DeviceInfo, error) { + for dev, info := range devices { + if info.IPv4Addr.Equal(ip) { + return dev, info, nil + } + } + return "", DeviceInfo{}, fmt.Errorf("can't find %s on any interface", ip) +} diff --git a/test/packetimpact/runner/BUILD b/test/packetimpact/runner/BUILD new file mode 100644 index 000000000..0b68a760a --- /dev/null +++ b/test/packetimpact/runner/BUILD @@ -0,0 +1,20 @@ +load("//tools:defs.bzl", "go_test") + +package( + default_visibility = ["//test/packetimpact:__subpackages__"], + licenses = ["notice"], +) + +go_test( + name = "packetimpact_test", + srcs = ["packetimpact_test.go"], + tags = [ + # Not intended to be run directly. + "local", + "manual", + ], + deps = [ + "//pkg/test/dockerutil", + "//test/packetimpact/netdevs", + ], +) diff --git a/test/packetimpact/tests/defs.bzl b/test/packetimpact/runner/defs.bzl index 27c5de375..ea66b9756 100644 --- a/test/packetimpact/tests/defs.bzl +++ b/test/packetimpact/runner/defs.bzl @@ -11,12 +11,10 @@ def _packetimpact_test_impl(ctx): # permission problems, because all runfiles may not be owned by the # current user, and no other users will be mapped in that namespace. # Make sure that everything is readable here. - "find . -type f -exec chmod a+rx {} \\;", - "find . -type d -exec chmod a+rx {} \\;", - "%s %s --posix_server_binary %s --testbench_binary %s $@\n" % ( + "find . -type f -or -type d -exec chmod a+rx {} \\;", + "%s %s --testbench_binary %s $@\n" % ( test_runner.short_path, " ".join(ctx.attr.flags), - ctx.files._posix_server_binary[0].short_path, ctx.files.testbench_binary[0].short_path, ), ]) @@ -38,7 +36,7 @@ _packetimpact_test = rule( "_test_runner": attr.label( executable = True, cfg = "target", - default = ":test_runner", + default = ":packetimpact_test", ), "_posix_server_binary": attr.label( cfg = "target", @@ -69,6 +67,7 @@ def packetimpact_linux_test( Args: name: name of the test testbench_binary: the testbench binary + expect_failure: the test must fail **kwargs: all the other args, forwarded to _packetimpact_test """ expect_failure_flag = ["--expect_failure"] if expect_failure else [] @@ -106,15 +105,15 @@ def packetimpact_netstack_test( **kwargs ) -def packetimpact_go_test(name, size = "small", pure = True, linux = True, netstack = True, **kwargs): +def packetimpact_go_test(name, size = "small", pure = True, expect_linux_failure = False, expect_netstack_failure = False, **kwargs): """Add packetimpact tests written in go. Args: name: name of the test size: size of the test pure: make a static go binary - linux: generate a linux test - netstack: generate a netstack test + expect_linux_failure: the test must fail for Linux + expect_netstack_failure: the test must fail for Netstack **kwargs: all the other args, forwarded to go_test """ testbench_binary = name + "_test" @@ -127,11 +126,11 @@ def packetimpact_go_test(name, size = "small", pure = True, linux = True, netsta ) packetimpact_linux_test( name = name, - expect_failure = not linux, + expect_failure = expect_linux_failure, testbench_binary = testbench_binary, ) packetimpact_netstack_test( name = name, - expect_failure = not netstack, + expect_failure = expect_netstack_failure, testbench_binary = testbench_binary, ) diff --git a/test/packetimpact/runner/packetimpact_test.go b/test/packetimpact/runner/packetimpact_test.go new file mode 100644 index 000000000..ac13c8543 --- /dev/null +++ b/test/packetimpact/runner/packetimpact_test.go @@ -0,0 +1,312 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// The runner starts docker containers and networking for a packetimpact test. +package packetimpact_test + +import ( + "flag" + "fmt" + "log" + "math/rand" + "net" + "path" + "strings" + "testing" + "time" + + "gvisor.dev/gvisor/pkg/test/dockerutil" + "gvisor.dev/gvisor/test/packetimpact/netdevs" +) + +// stringList implements flag.Value. +type stringList []string + +// String implements flag.Value.String. +func (l *stringList) String() string { + return strings.Join(*l, ",") +} + +// Set implements flag.Value.Set. +func (l *stringList) Set(value string) error { + *l = append(*l, value) + return nil +} + +var ( + dutPlatform = flag.String("dut_platform", "", "either \"linux\" or \"netstack\"") + testbenchBinary = flag.String("testbench_binary", "", "path to the testbench binary") + tshark = flag.Bool("tshark", false, "use more verbose tshark in logs instead of tcpdump") + extraTestArgs = stringList{} + expectFailure = flag.Bool("expect_failure", false, "expect that the test will fail when run") + + dutAddr = net.IPv4(0, 0, 0, 10) + testbenchAddr = net.IPv4(0, 0, 0, 20) +) + +const ctrlPort = "40000" + +// logger implements testutil.Logger. +// +// Labels logs based on their source and formats multi-line logs. +type logger string + +// Name implements testutil.Logger.Name. +func (l logger) Name() string { + return string(l) +} + +// Logf implements testutil.Logger.Logf. +func (l logger) Logf(format string, args ...interface{}) { + lines := strings.Split(fmt.Sprintf(format, args...), "\n") + log.Printf("%s: %s", l, lines[0]) + for _, line := range lines[1:] { + log.Printf("%*s %s", len(l), "", line) + } +} + +func TestOne(t *testing.T) { + flag.Var(&extraTestArgs, "extra_test_arg", "extra arguments to pass to the testbench") + flag.Parse() + if *dutPlatform != "linux" && *dutPlatform != "netstack" { + t.Fatal("--dut_platform should be either linux or netstack") + } + if *testbenchBinary == "" { + t.Fatal("--testbench_binary is missing") + } + if *dutPlatform == "netstack" { + if _, err := dockerutil.RuntimePath(); err != nil { + t.Fatal("--runtime is missing or invalid with --dut_platform=netstack:", err) + } + } + dockerutil.EnsureSupportedDockerVersion() + + // Create the networks needed for the test. One control network is needed for + // the gRPC control packets and one test network on which to transmit the test + // packets. + ctrlNet := dockerutil.NewDockerNetwork(logger("ctrlNet")) + testNet := dockerutil.NewDockerNetwork(logger("testNet")) + for _, dn := range []*dockerutil.DockerNetwork{ctrlNet, testNet} { + for { + if err := createDockerNetwork(dn); err != nil { + t.Log("creating docker network:", err) + const wait = 100 * time.Millisecond + t.Logf("sleeping %s and will try creating docker network again", wait) + // This can fail if another docker network claimed the same IP so we'll + // just try again. + time.Sleep(wait) + continue + } + break + } + defer func(dn *dockerutil.DockerNetwork) { + if err := dn.Cleanup(); err != nil { + t.Errorf("unable to cleanup container %s: %s", dn.Name, err) + } + }(dn) + } + + runOpts := dockerutil.RunOpts{ + Image: "packetimpact", + CapAdd: []string{"NET_ADMIN"}, + Extra: []string{"--sysctl", "net.ipv6.conf.all.disable_ipv6=0", "--rm"}, + Foreground: true, + } + + // Create the Docker container for the DUT. + dut := dockerutil.MakeDocker(logger("dut")) + if *dutPlatform == "linux" { + dut.Runtime = "" + } + + const containerPosixServerBinary = "/packetimpact/posix_server" + dut.CopyFiles("/packetimpact", "/test/packetimpact/dut/posix_server") + + if err := dut.Create(runOpts, containerPosixServerBinary, "--ip=0.0.0.0", "--port="+ctrlPort); err != nil { + t.Fatalf("unable to create container %s: %s", dut.Name, err) + } + defer dut.CleanUp() + + // Add ctrlNet as eth1 and testNet as eth2. + const testNetDev = "eth2" + if err := addNetworks(dut, dutAddr, []*dockerutil.DockerNetwork{ctrlNet, testNet}); err != nil { + t.Fatal(err) + } + + if err := dut.Start(); err != nil { + t.Fatalf("unable to start container %s: %s", dut.Name, err) + } + + if _, err := dut.WaitForOutput("Server listening.*\n", 60*time.Second); err != nil { + t.Fatalf("%s on container %s never listened: %s", containerPosixServerBinary, dut.Name, err) + } + + dutTestDevice, dutDeviceInfo, err := deviceByIP(dut, addressInSubnet(dutAddr, *testNet.Subnet)) + if err != nil { + t.Fatal(err) + } + + remoteMAC := dutDeviceInfo.MAC + remoteIPv6 := dutDeviceInfo.IPv6Addr + // Netstack as DUT doesn't assign IPv6 addresses automatically so do it if + // needed. + if remoteIPv6 == nil { + if _, err := dut.Exec(dockerutil.RunOpts{}, "ip", "addr", "add", netdevs.MACToIP(remoteMAC).String(), "scope", "link", "dev", dutTestDevice); err != nil { + t.Fatalf("unable to ip addr add on container %s: %s", dut.Name, err) + } + // Now try again, to make sure that it worked. + _, dutDeviceInfo, err = deviceByIP(dut, addressInSubnet(dutAddr, *testNet.Subnet)) + if err != nil { + t.Fatal(err) + } + remoteIPv6 = dutDeviceInfo.IPv6Addr + if remoteIPv6 == nil { + t.Fatal("unable to set IPv6 address on container", dut.Name) + } + } + + // Create the Docker container for the testbench. + testbench := dockerutil.MakeDocker(logger("testbench")) + testbench.Runtime = "" // The testbench always runs on Linux. + + tbb := path.Base(*testbenchBinary) + containerTestbenchBinary := "/packetimpact/" + tbb + testbench.CopyFiles("/packetimpact", "/test/packetimpact/tests/"+tbb) + + // Run tcpdump in the test bench unbuffered, without DNS resolution, just on + // the interface with the test packets. + snifferArgs := []string{ + "tcpdump", "-S", "-vvv", "-U", "-n", "-i", testNetDev, + } + snifferRegex := "tcpdump: listening.*\n" + if *tshark { + // Run tshark in the test bench unbuffered, without DNS resolution, just on + // the interface with the test packets. + snifferArgs = []string{ + "tshark", "-V", "-l", "-n", "-i", testNetDev, + "-o", "tcp.check_checksum:TRUE", + "-o", "udp.check_checksum:TRUE", + } + snifferRegex = "Capturing on.*\n" + } + + if err := testbench.Create(runOpts, snifferArgs...); err != nil { + t.Fatalf("unable to create container %s: %s", testbench.Name, err) + } + defer testbench.CleanUp() + + // Add ctrlNet as eth1 and testNet as eth2. + if err := addNetworks(testbench, testbenchAddr, []*dockerutil.DockerNetwork{ctrlNet, testNet}); err != nil { + t.Fatal(err) + } + + if err := testbench.Start(); err != nil { + t.Fatalf("unable to start container %s: %s", testbench.Name, err) + } + + // Kill so that it will flush output. + defer testbench.Exec(dockerutil.RunOpts{}, "killall", snifferArgs[0]) + + if _, err := testbench.WaitForOutput(snifferRegex, 60*time.Second); err != nil { + t.Fatalf("sniffer on %s never listened: %s", dut.Name, err) + } + + // Because the Linux kernel receives the SYN-ACK but didn't send the SYN it + // will issue a RST. To prevent this IPtables can be used to filter out all + // incoming packets. The raw socket that packetimpact tests use will still see + // everything. + if _, err := testbench.Exec(dockerutil.RunOpts{}, "iptables", "-A", "INPUT", "-i", testNetDev, "-j", "DROP"); err != nil { + t.Fatalf("unable to Exec iptables on container %s: %s", testbench.Name, err) + } + + // FIXME(b/156449515): Some piece of the system has a race. The old + // bash script version had a sleep, so we have one too. The race should + // be fixed and this sleep removed. + time.Sleep(time.Second) + + // Start a packetimpact test on the test bench. The packetimpact test sends + // and receives packets and also sends POSIX socket commands to the + // posix_server to be executed on the DUT. + testArgs := []string{containerTestbenchBinary} + testArgs = append(testArgs, extraTestArgs...) + testArgs = append(testArgs, + "--posix_server_ip", addressInSubnet(dutAddr, *ctrlNet.Subnet).String(), + "--posix_server_port", ctrlPort, + "--remote_ipv4", addressInSubnet(dutAddr, *testNet.Subnet).String(), + "--local_ipv4", addressInSubnet(testbenchAddr, *testNet.Subnet).String(), + "--remote_ipv6", remoteIPv6.String(), + "--remote_mac", remoteMAC.String(), + "--device", testNetDev, + ) + _, err = testbench.Exec(dockerutil.RunOpts{}, testArgs...) + if !*expectFailure && err != nil { + t.Fatal("test failed:", err) + } + if *expectFailure && err == nil { + t.Fatal("test failure expected but the test succeeded, enable the test and mark the corresponding bug as fixed") + } +} + +func addNetworks(d *dockerutil.Docker, addr net.IP, networks []*dockerutil.DockerNetwork) error { + for _, dn := range networks { + ip := addressInSubnet(addr, *dn.Subnet) + // Connect to the network with the specified IP address. + if err := dn.Connect(d, "--ip", ip.String()); err != nil { + return fmt.Errorf("unable to connect container %s to network %s: %w", d.Name, dn.Name, err) + } + } + return nil +} + +// addressInSubnet combines the subnet provided with the address and returns a +// new address. The return address bits come from the subnet where the mask is 1 +// and from the ip address where the mask is 0. +func addressInSubnet(addr net.IP, subnet net.IPNet) net.IP { + var octets []byte + for i := 0; i < 4; i++ { + octets = append(octets, (subnet.IP.To4()[i]&subnet.Mask[i])+(addr.To4()[i]&(^subnet.Mask[i]))) + } + return net.IP(octets) +} + +// makeDockerNetwork makes a randomly-named network that will start with the +// namePrefix. The network will be a random /24 subnet. +func createDockerNetwork(n *dockerutil.DockerNetwork) error { + randSource := rand.NewSource(time.Now().UnixNano()) + r1 := rand.New(randSource) + // Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24. + ip := net.IPv4(byte(r1.Intn(224-192)+192), byte(r1.Intn(256)), byte(r1.Intn(256)), 0) + n.Subnet = &net.IPNet{ + IP: ip, + Mask: ip.DefaultMask(), + } + return n.Create() +} + +// deviceByIP finds a deviceInfo and device name from an IP address. +func deviceByIP(d *dockerutil.Docker, ip net.IP) (string, netdevs.DeviceInfo, error) { + out, err := d.Exec(dockerutil.RunOpts{}, "ip", "addr", "show") + if err != nil { + return "", netdevs.DeviceInfo{}, fmt.Errorf("listing devices on %s container: %w", d.Name, err) + } + devs, err := netdevs.ParseDevices(out) + if err != nil { + return "", netdevs.DeviceInfo{}, fmt.Errorf("parsing devices from %s container: %w", d.Name, err) + } + testDevice, deviceInfo, err := netdevs.FindDeviceByIP(ip, devs) + if err != nil { + return "", netdevs.DeviceInfo{}, fmt.Errorf("can't find deviceInfo for container %s: %w", d.Name, err) + } + return testDevice, deviceInfo, nil +} diff --git a/test/packetimpact/testbench/BUILD b/test/packetimpact/testbench/BUILD index fed51006f..d19ec07d4 100644 --- a/test/packetimpact/testbench/BUILD +++ b/test/packetimpact/testbench/BUILD @@ -21,6 +21,7 @@ go_library( "//pkg/tcpip/header", "//pkg/tcpip/seqnum", "//pkg/usermem", + "//test/packetimpact/netdevs", "//test/packetimpact/proto:posix_server_go_proto", "@com_github_google_go-cmp//cmp:go_default_library", "@com_github_google_go-cmp//cmp/cmpopts:go_default_library", @@ -39,6 +40,7 @@ go_test( library = ":testbench", deps = [ "//pkg/tcpip", + "//pkg/tcpip/header", "@com_github_mohae_deepcopy//:go_default_library", ], ) diff --git a/test/packetimpact/testbench/connections.go b/test/packetimpact/testbench/connections.go index 463fd0556..bf104e5ca 100644 --- a/test/packetimpact/testbench/connections.go +++ b/test/packetimpact/testbench/connections.go @@ -114,12 +114,12 @@ var _ layerState = (*etherState)(nil) func newEtherState(out, in Ether) (*etherState, error) { lMAC, err := tcpip.ParseMACAddress(LocalMAC) if err != nil { - return nil, err + return nil, fmt.Errorf("parsing local MAC: %q: %w", LocalMAC, err) } rMAC, err := tcpip.ParseMACAddress(RemoteMAC) if err != nil { - return nil, err + return nil, fmt.Errorf("parsing remote MAC: %q: %w", RemoteMAC, err) } s := etherState{ out: Ether{SrcAddr: &lMAC, DstAddr: &rMAC}, diff --git a/test/packetimpact/testbench/dut.go b/test/packetimpact/testbench/dut.go index a78b7d7ee..b919a3c2e 100644 --- a/test/packetimpact/testbench/dut.go +++ b/test/packetimpact/testbench/dut.go @@ -16,6 +16,7 @@ package testbench import ( "context" + "flag" "net" "strconv" "syscall" @@ -37,6 +38,11 @@ type DUT struct { // NewDUT creates a new connection with the DUT over gRPC. func NewDUT(t *testing.T) DUT { + flag.Parse() + if err := genPseudoFlags(); err != nil { + t.Fatal("generating psuedo flags:", err) + } + posixServerAddress := POSIXServerIP + ":" + strconv.Itoa(POSIXServerPort) conn, err := grpc.Dial(posixServerAddress, grpc.WithInsecure(), grpc.WithKeepaliveParams(keepalive.ClientParameters{Timeout: RPCKeepalive})) if err != nil { diff --git a/test/packetimpact/testbench/layers.go b/test/packetimpact/testbench/layers.go index 49370377d..1b0e5b8fc 100644 --- a/test/packetimpact/testbench/layers.go +++ b/test/packetimpact/testbench/layers.go @@ -689,6 +689,7 @@ type TCP struct { WindowSize *uint16 Checksum *uint16 UrgentPointer *uint16 + Options []byte } func (l *TCP) String() string { @@ -697,7 +698,7 @@ func (l *TCP) String() string { // ToBytes implements Layer.ToBytes. func (l *TCP) ToBytes() ([]byte, error) { - b := make([]byte, header.TCPMinimumSize) + b := make([]byte, l.length()) h := header.TCP(b) if l.SrcPort != nil { h.SetSourcePort(*l.SrcPort) @@ -727,6 +728,8 @@ func (l *TCP) ToBytes() ([]byte, error) { if l.UrgentPointer != nil { h.SetUrgentPoiner(*l.UrgentPointer) } + copy(b[header.TCPMinimumSize:], l.Options) + header.AddTCPOptionPadding(b[header.TCPMinimumSize:], len(l.Options)) if l.Checksum != nil { h.SetChecksum(*l.Checksum) return h, nil @@ -811,6 +814,7 @@ func parseTCP(b []byte) (Layer, layerParser) { WindowSize: Uint16(h.WindowSize()), Checksum: Uint16(h.Checksum()), UrgentPointer: Uint16(h.UrgentPointer()), + Options: b[header.TCPMinimumSize:h.DataOffset()], } return &tcp, parsePayload } @@ -821,7 +825,12 @@ func (l *TCP) match(other Layer) bool { func (l *TCP) length() int { if l.DataOffset == nil { - return header.TCPMinimumSize + // TCP header including the options must end on a 32-bit + // boundary; the user could potentially give us a slice + // whose length is not a multiple of 4 bytes, so we have + // to do the alignment here. + optlen := (len(l.Options) + 3) & ^3 + return header.TCPMinimumSize + optlen } return int(*l.DataOffset) } diff --git a/test/packetimpact/testbench/layers_test.go b/test/packetimpact/testbench/layers_test.go index 96f72de5b..c7f00e70d 100644 --- a/test/packetimpact/testbench/layers_test.go +++ b/test/packetimpact/testbench/layers_test.go @@ -15,10 +15,13 @@ package testbench import ( + "bytes" + "net" "testing" "github.com/mohae/deepcopy" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" ) func TestLayerMatch(t *testing.T) { @@ -393,3 +396,112 @@ func TestLayersDiff(t *testing.T) { } } } + +func TestTCPOptions(t *testing.T) { + for _, tt := range []struct { + description string + wantBytes []byte + wantLayers Layers + }{ + { + description: "without payload", + wantBytes: []byte{ + // IPv4 Header + 0x45, 0x00, 0x00, 0x2c, 0x00, 0x01, 0x00, 0x00, 0x40, 0x06, + 0xf9, 0x77, 0xc0, 0xa8, 0x00, 0x02, 0xc0, 0xa8, 0x00, 0x01, + // TCP Header + 0x30, 0x39, 0xd4, 0x31, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x60, 0x02, 0x20, 0x00, 0xf5, 0x1c, 0x00, 0x00, + // WindowScale Option + 0x03, 0x03, 0x02, + // NOP Option + 0x00, + }, + wantLayers: []Layer{ + &IPv4{ + IHL: Uint8(20), + TOS: Uint8(0), + TotalLength: Uint16(44), + ID: Uint16(1), + Flags: Uint8(0), + FragmentOffset: Uint16(0), + TTL: Uint8(64), + Protocol: Uint8(uint8(header.TCPProtocolNumber)), + Checksum: Uint16(0xf977), + SrcAddr: Address(tcpip.Address(net.ParseIP("192.168.0.2").To4())), + DstAddr: Address(tcpip.Address(net.ParseIP("192.168.0.1").To4())), + }, + &TCP{ + SrcPort: Uint16(12345), + DstPort: Uint16(54321), + SeqNum: Uint32(0), + AckNum: Uint32(0), + Flags: Uint8(header.TCPFlagSyn), + WindowSize: Uint16(8192), + Checksum: Uint16(0xf51c), + UrgentPointer: Uint16(0), + Options: []byte{3, 3, 2, 0}, + }, + &Payload{Bytes: nil}, + }, + }, + { + description: "with payload", + wantBytes: []byte{ + // IPv4 header + 0x45, 0x00, 0x00, 0x37, 0x00, 0x01, 0x00, 0x00, 0x40, 0x06, + 0xf9, 0x6c, 0xc0, 0xa8, 0x00, 0x02, 0xc0, 0xa8, 0x00, 0x01, + // TCP header + 0x30, 0x39, 0xd4, 0x31, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x60, 0x02, 0x20, 0x00, 0xe5, 0x21, 0x00, 0x00, + // WindowScale Option + 0x03, 0x03, 0x02, + // NOP Option + 0x00, + // Payload: "Sample Data" + 0x53, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x20, 0x44, 0x61, 0x74, 0x61, + }, + wantLayers: []Layer{ + &IPv4{ + IHL: Uint8(20), + TOS: Uint8(0), + TotalLength: Uint16(55), + ID: Uint16(1), + Flags: Uint8(0), + FragmentOffset: Uint16(0), + TTL: Uint8(64), + Protocol: Uint8(uint8(header.TCPProtocolNumber)), + Checksum: Uint16(0xf96c), + SrcAddr: Address(tcpip.Address(net.ParseIP("192.168.0.2").To4())), + DstAddr: Address(tcpip.Address(net.ParseIP("192.168.0.1").To4())), + }, + &TCP{ + SrcPort: Uint16(12345), + DstPort: Uint16(54321), + SeqNum: Uint32(0), + AckNum: Uint32(0), + Flags: Uint8(header.TCPFlagSyn), + WindowSize: Uint16(8192), + Checksum: Uint16(0xe521), + UrgentPointer: Uint16(0), + Options: []byte{3, 3, 2, 0}, + }, + &Payload{Bytes: []byte("Sample Data")}, + }, + }, + } { + t.Run(tt.description, func(t *testing.T) { + layers := parse(parseIPv4, tt.wantBytes) + if !layers.match(tt.wantLayers) { + t.Fatalf("match failed with diff: %s", layers.diff(tt.wantLayers)) + } + gotBytes, err := layers.ToBytes() + if err != nil { + t.Fatalf("ToBytes() failed on %s: %s", &layers, err) + } + if !bytes.Equal(tt.wantBytes, gotBytes) { + t.Fatalf("mismatching bytes, gotBytes: %x, wantBytes: %x", gotBytes, tt.wantBytes) + } + }) + } +} diff --git a/test/packetimpact/testbench/rawsockets.go b/test/packetimpact/testbench/rawsockets.go index 4665f60b2..278229b7e 100644 --- a/test/packetimpact/testbench/rawsockets.go +++ b/test/packetimpact/testbench/rawsockets.go @@ -16,7 +16,6 @@ package testbench import ( "encoding/binary" - "flag" "fmt" "math" "net" @@ -41,7 +40,6 @@ func htons(x uint16) uint16 { // NewSniffer creates a Sniffer connected to *device. func NewSniffer(t *testing.T) (Sniffer, error) { - flag.Parse() snifferFd, err := unix.Socket(unix.AF_PACKET, unix.SOCK_RAW, int(htons(unix.ETH_P_ALL))) if err != nil { return Sniffer{}, err @@ -136,7 +134,6 @@ type Injector struct { // NewInjector creates a new injector on *device. func NewInjector(t *testing.T) (Injector, error) { - flag.Parse() ifInfo, err := net.InterfaceByName(Device) if err != nil { return Injector{}, err diff --git a/test/packetimpact/testbench/testbench.go b/test/packetimpact/testbench/testbench.go index a1242b189..4de2aa1d3 100644 --- a/test/packetimpact/testbench/testbench.go +++ b/test/packetimpact/testbench/testbench.go @@ -16,7 +16,12 @@ package testbench import ( "flag" + "fmt" + "net" + "os/exec" "time" + + "gvisor.dev/gvisor/test/packetimpact/netdevs" ) var ( @@ -55,9 +60,31 @@ func RegisterFlags(fs *flag.FlagSet) { fs.DurationVar(&RPCKeepalive, "rpc_keepalive", RPCKeepalive, "gRPC keepalive") fs.StringVar(&LocalIPv4, "local_ipv4", LocalIPv4, "local IPv4 address for test packets") fs.StringVar(&RemoteIPv4, "remote_ipv4", RemoteIPv4, "remote IPv4 address for test packets") - fs.StringVar(&LocalIPv6, "local_ipv6", LocalIPv6, "local IPv6 address for test packets") fs.StringVar(&RemoteIPv6, "remote_ipv6", RemoteIPv6, "remote IPv6 address for test packets") - fs.StringVar(&LocalMAC, "local_mac", LocalMAC, "local mac address for test packets") fs.StringVar(&RemoteMAC, "remote_mac", RemoteMAC, "remote mac address for test packets") fs.StringVar(&Device, "device", Device, "local device for test packets") } + +// genPseudoFlags populates flag-like global config based on real flags. +// +// genPseudoFlags must only be called after flag.Parse. +func genPseudoFlags() error { + out, err := exec.Command("ip", "addr", "show").CombinedOutput() + if err != nil { + return fmt.Errorf("listing devices: %q: %w", string(out), err) + } + devs, err := netdevs.ParseDevices(string(out)) + if err != nil { + return fmt.Errorf("parsing devices: %w", err) + } + + _, deviceInfo, err := netdevs.FindDeviceByIP(net.ParseIP(LocalIPv4), devs) + if err != nil { + return fmt.Errorf("can't find deviceInfo: %w", err) + } + + LocalMAC = deviceInfo.MAC.String() + LocalIPv6 = deviceInfo.IPv6Addr.String() + + return nil +} diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD index c25b3b8c1..3a0e9cb07 100644 --- a/test/packetimpact/tests/BUILD +++ b/test/packetimpact/tests/BUILD @@ -1,4 +1,4 @@ -load("defs.bzl", "packetimpact_go_test") +load("//test/packetimpact/runner:defs.bzl", "packetimpact_go_test") package( default_visibility = ["//test/packetimpact:__subpackages__"], @@ -19,7 +19,7 @@ packetimpact_go_test( name = "udp_recv_multicast", srcs = ["udp_recv_multicast_test.go"], # TODO(b/152813495): Fix netstack then remove the line below. - netstack = False, + expect_netstack_failure = True, deps = [ "//pkg/tcpip", "//pkg/tcpip/header", @@ -31,8 +31,6 @@ packetimpact_go_test( packetimpact_go_test( name = "udp_icmp_error_propagation", srcs = ["udp_icmp_error_propagation_test.go"], - # TODO(b/153926291): Fix netstack then remove the line below. - netstack = False, deps = [ "//pkg/tcpip", "//pkg/tcpip/header", @@ -82,6 +80,16 @@ packetimpact_go_test( ) packetimpact_go_test( + name = "tcp_retransmits", + srcs = ["tcp_retransmits_test.go"], + deps = [ + "//pkg/tcpip/header", + "//test/packetimpact/testbench", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +packetimpact_go_test( name = "tcp_outside_the_window", srcs = ["tcp_outside_the_window_test.go"], deps = [ @@ -106,7 +114,7 @@ packetimpact_go_test( name = "tcp_should_piggyback", srcs = ["tcp_should_piggyback_test.go"], # TODO(b/153680566): Fix netstack then remove the line below. - netstack = False, + expect_netstack_failure = True, deps = [ "//pkg/tcpip/header", "//test/packetimpact/testbench", @@ -126,6 +134,19 @@ packetimpact_go_test( ) packetimpact_go_test( + name = "tcp_paws_mechanism", + srcs = ["tcp_paws_mechanism_test.go"], + # TODO(b/156682000): Fix netstack then remove the line below. + expect_netstack_failure = True, + deps = [ + "//pkg/tcpip/header", + "//pkg/tcpip/seqnum", + "//test/packetimpact/testbench", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +packetimpact_go_test( name = "tcp_user_timeout", srcs = ["tcp_user_timeout_test.go"], deps = [ @@ -139,7 +160,7 @@ packetimpact_go_test( name = "icmpv6_param_problem", srcs = ["icmpv6_param_problem_test.go"], # TODO(b/153485026): Fix netstack then remove the line below. - netstack = False, + expect_netstack_failure = True, deps = [ "//pkg/tcpip", "//pkg/tcpip/header", @@ -156,8 +177,3 @@ packetimpact_go_test( "@org_golang_x_sys//unix:go_default_library", ], ) - -sh_binary( - name = "test_runner", - srcs = ["test_runner.sh"], -) diff --git a/test/packetimpact/tests/tcp_paws_mechanism_test.go b/test/packetimpact/tests/tcp_paws_mechanism_test.go new file mode 100644 index 000000000..0a668adcf --- /dev/null +++ b/test/packetimpact/tests/tcp_paws_mechanism_test.go @@ -0,0 +1,109 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp_paws_mechanism_test + +import ( + "encoding/hex" + "flag" + "testing" + "time" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/tcpip/header" + tb "gvisor.dev/gvisor/test/packetimpact/testbench" +) + +func init() { + tb.RegisterFlags(flag.CommandLine) +} + +func TestPAWSMechanism(t *testing.T) { + dut := tb.NewDUT(t) + defer dut.TearDown() + listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + defer dut.Close(listenFD) + conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort}) + defer conn.Close() + + options := make([]byte, header.TCPOptionTSLength) + header.EncodeTSOption(currentTS(), 0, options) + conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagSyn), Options: options}) + synAck, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, time.Second) + if err != nil { + t.Fatalf("didn't get synack during handshake: %s", err) + } + parsedSynOpts := header.ParseSynOptions(synAck.Options, true) + if !parsedSynOpts.TS { + t.Fatalf("expected TSOpt from DUT, options we got:\n%s", hex.Dump(synAck.Options)) + } + tsecr := parsedSynOpts.TSVal + header.EncodeTSOption(currentTS(), tsecr, options) + conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck), Options: options}) + acceptFD, _ := dut.Accept(listenFD) + defer dut.Close(acceptFD) + + sampleData := []byte("Sample Data") + sentTSVal := currentTS() + header.EncodeTSOption(sentTSVal, tsecr, options) + // 3ms here is chosen arbitrarily to make sure we have increasing timestamps + // every time we send one, it should not cause any flakiness because timestamps + // only need to be non-decreasing. + time.Sleep(3 * time.Millisecond) + conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck), Options: options}, &tb.Payload{Bytes: sampleData}) + + gotTCP, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)}, time.Second) + if err != nil { + t.Fatalf("expected an ACK but got none: %s", err) + } + + parsedOpts := header.ParseTCPOptions(gotTCP.Options) + if !parsedOpts.TS { + t.Fatalf("expected TS option in response, options we got:\n%s", hex.Dump(gotTCP.Options)) + } + if parsedOpts.TSVal < tsecr { + t.Fatalf("TSVal should be non-decreasing, but %d < %d", parsedOpts.TSVal, tsecr) + } + if parsedOpts.TSEcr != sentTSVal { + t.Fatalf("TSEcr should match our sent TSVal, %d != %d", parsedOpts.TSEcr, sentTSVal) + } + tsecr = parsedOpts.TSVal + lastAckNum := gotTCP.AckNum + + badTSVal := sentTSVal - 100 + header.EncodeTSOption(badTSVal, tsecr, options) + // 3ms here is chosen arbitrarily and this time.Sleep() should not cause flakiness + // due to the exact same reasoning discussed above. + time.Sleep(3 * time.Millisecond) + conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck), Options: options}, &tb.Payload{Bytes: sampleData}) + + gotTCP, err = conn.Expect(tb.TCP{AckNum: lastAckNum, Flags: tb.Uint8(header.TCPFlagAck)}, time.Second) + if err != nil { + t.Fatalf("expected segment with AckNum %d but got none: %s", lastAckNum, err) + } + parsedOpts = header.ParseTCPOptions(gotTCP.Options) + if !parsedOpts.TS { + t.Fatalf("expected TS option in response, options we got:\n%s", hex.Dump(gotTCP.Options)) + } + if parsedOpts.TSVal < tsecr { + t.Fatalf("TSVal should be non-decreasing, but %d < %d", parsedOpts.TSVal, tsecr) + } + if parsedOpts.TSEcr != sentTSVal { + t.Fatalf("TSEcr should match our sent TSVal, %d != %d", parsedOpts.TSEcr, sentTSVal) + } +} + +func currentTS() uint32 { + return uint32(time.Now().UnixNano() / 1e6) +} diff --git a/test/packetimpact/tests/tcp_retransmits_test.go b/test/packetimpact/tests/tcp_retransmits_test.go new file mode 100644 index 000000000..c043ad881 --- /dev/null +++ b/test/packetimpact/tests/tcp_retransmits_test.go @@ -0,0 +1,84 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp_retransmits_test + +import ( + "flag" + "testing" + "time" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/tcpip/header" + tb "gvisor.dev/gvisor/test/packetimpact/testbench" +) + +func init() { + tb.RegisterFlags(flag.CommandLine) +} + +// TestRetransmits tests retransmits occur at exponentially increasing +// time intervals. +func TestRetransmits(t *testing.T) { + dut := tb.NewDUT(t) + defer dut.TearDown() + listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + defer dut.Close(listenFd) + conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort}) + defer conn.Close() + + conn.Handshake() + acceptFd, _ := dut.Accept(listenFd) + defer dut.Close(acceptFd) + + dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1) + + sampleData := []byte("Sample Data") + samplePayload := &tb.Payload{Bytes: sampleData} + + dut.Send(acceptFd, sampleData, 0) + if _, err := conn.ExpectData(&tb.TCP{}, samplePayload, time.Second); err != nil { + t.Fatalf("expected a packet with payload %v: %s", samplePayload, err) + } + // Give a chance for the dut to estimate RTO with RTT from the DATA-ACK. + // TODO(gvisor.dev/issue/2685) Estimate RTO during handshake, after which + // we can skip sending this ACK. + conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)}) + + startRTO := time.Second + current := startRTO + first := time.Now() + dut.Send(acceptFd, sampleData, 0) + seq := tb.Uint32(uint32(*conn.RemoteSeqNum())) + if _, err := conn.ExpectData(&tb.TCP{SeqNum: seq}, samplePayload, startRTO); err != nil { + t.Fatalf("expected a packet with payload %v: %s", samplePayload, err) + } + // Expect retransmits of the same segment. + for i := 0; i < 5; i++ { + start := time.Now() + if _, err := conn.ExpectData(&tb.TCP{SeqNum: seq}, samplePayload, 2*current); err != nil { + t.Fatalf("expected a packet with payload %v: %s loop %d", samplePayload, err, i) + } + if i == 0 { + startRTO = time.Now().Sub(first) + current = 2 * startRTO + continue + } + // Check if the probes came at exponentially increasing intervals. + if p := time.Since(start); p < current-startRTO { + t.Fatalf("retransmit came sooner interval %d probe %d\n", p, i) + } + current *= 2 + } +} diff --git a/test/packetimpact/tests/test_runner.sh b/test/packetimpact/tests/test_runner.sh deleted file mode 100755 index 706441cce..000000000 --- a/test/packetimpact/tests/test_runner.sh +++ /dev/null @@ -1,325 +0,0 @@ -#!/bin/bash - -# Copyright 2020 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Run a packetimpact test. Two docker containers are made, one for the -# Device-Under-Test (DUT) and one for the test bench. Each is attached with -# two networks, one for control packets that aid the test and one for test -# packets which are sent as part of the test and observed for correctness. - -set -euxo pipefail - -function failure() { - local lineno=$1 - local msg=$2 - local filename="$0" - echo "FAIL: $filename:$lineno: $msg" -} -trap 'failure ${LINENO} "$BASH_COMMAND"' ERR - -declare -r LONGOPTS="dut_platform:,posix_server_binary:,testbench_binary:,runtime:,tshark,extra_test_arg:,expect_failure" - -# Don't use declare below so that the error from getopt will end the script. -PARSED=$(getopt --options "" --longoptions=$LONGOPTS --name "$0" -- "$@") - -eval set -- "$PARSED" - -declare -a EXTRA_TEST_ARGS - -while true; do - case "$1" in - --dut_platform) - # Either "linux" or "netstack". - declare -r DUT_PLATFORM="$2" - shift 2 - ;; - --posix_server_binary) - declare -r POSIX_SERVER_BINARY="$2" - shift 2 - ;; - --testbench_binary) - declare -r TESTBENCH_BINARY="$2" - shift 2 - ;; - --runtime) - # Not readonly because there might be multiple --runtime arguments and we - # want to use just the last one. Only used if --dut_platform is - # "netstack". - declare RUNTIME="$2" - shift 2 - ;; - --tshark) - declare -r TSHARK="1" - shift 1 - ;; - --extra_test_arg) - EXTRA_TEST_ARGS+="$2" - shift 2 - ;; - --expect_failure) - declare -r EXPECT_FAILURE="1" - shift 1 - ;; - --) - shift - break - ;; - *) - echo "Programming error" - exit 3 - esac -done - -# All the other arguments are scripts. -declare -r scripts="$@" - -# Check that the required flags are defined in a way that is safe for "set -u". -if [[ "${DUT_PLATFORM-}" == "netstack" ]]; then - if [[ -z "${RUNTIME-}" ]]; then - echo "FAIL: Missing --runtime argument: ${RUNTIME-}" - exit 2 - fi - declare -r RUNTIME_ARG="--runtime ${RUNTIME}" -elif [[ "${DUT_PLATFORM-}" == "linux" ]]; then - declare -r RUNTIME_ARG="" -else - echo "FAIL: Bad or missing --dut_platform argument: ${DUT_PLATFORM-}" - exit 2 -fi -if [[ ! -f "${POSIX_SERVER_BINARY-}" ]]; then - echo "FAIL: Bad or missing --posix_server_binary: ${POSIX_SERVER-}" - exit 2 -fi -if [[ ! -f "${TESTBENCH_BINARY-}" ]]; then - echo "FAIL: Bad or missing --testbench_binary: ${TESTBENCH_BINARY-}" - exit 2 -fi - -function new_net_prefix() { - # Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24. - echo "$(shuf -i 192-223 -n 1).$(shuf -i 0-255 -n 1).$(shuf -i 0-255 -n 1)" -} - -# Variables specific to the control network and interface start with CTRL_. -# Variables specific to the test network and interface start with TEST_. -# Variables specific to the DUT start with DUT_. -# Variables specific to the test bench start with TESTBENCH_. -# Use random numbers so that test networks don't collide. -declare CTRL_NET="ctrl_net-${RANDOM}${RANDOM}" -declare CTRL_NET_PREFIX=$(new_net_prefix) -declare TEST_NET="test_net-${RANDOM}${RANDOM}" -declare TEST_NET_PREFIX=$(new_net_prefix) -# On both DUT and test bench, testing packets are on the eth2 interface. -declare -r TEST_DEVICE="eth2" -# Number of bits in the *_NET_PREFIX variables. -declare -r NET_MASK="24" -# Last bits of the DUT's IP address. -declare -r DUT_NET_SUFFIX=".10" -# Control port. -declare -r CTRL_PORT="40000" -# Last bits of the test bench's IP address. -declare -r TESTBENCH_NET_SUFFIX=".20" -declare -r TIMEOUT="60" -declare -r IMAGE_TAG="gcr.io/gvisor-presubmit/packetimpact" - -# Make sure that docker is installed. -docker --version - -function finish { - local cleanup_success=1 - - if [[ -z "${TSHARK-}" ]]; then - # Kill tcpdump so that it will flush output. - docker exec -t "${TESTBENCH}" \ - killall tcpdump || \ - cleanup_success=0 - else - # Kill tshark so that it will flush output. - docker exec -t "${TESTBENCH}" \ - killall tshark || \ - cleanup_success=0 - fi - - for net in "${CTRL_NET}" "${TEST_NET}"; do - # Kill all processes attached to ${net}. - for docker_command in "kill" "rm"; do - (docker network inspect "${net}" \ - --format '{{range $key, $value := .Containers}}{{$key}} {{end}}' \ - | xargs -r docker "${docker_command}") || \ - cleanup_success=0 - done - # Remove the network. - docker network rm "${net}" || \ - cleanup_success=0 - done - - if ((!$cleanup_success)); then - echo "FAIL: Cleanup command failed" - exit 4 - fi -} -trap finish EXIT - -# Subnet for control packets between test bench and DUT. -while ! docker network create \ - "--subnet=${CTRL_NET_PREFIX}.0/${NET_MASK}" "${CTRL_NET}"; do - sleep 0.1 - CTRL_NET_PREFIX=$(new_net_prefix) - CTRL_NET="ctrl_net-${RANDOM}${RANDOM}" -done - -# Subnet for the packets that are part of the test. -while ! docker network create \ - "--subnet=${TEST_NET_PREFIX}.0/${NET_MASK}" "${TEST_NET}"; do - sleep 0.1 - TEST_NET_PREFIX=$(new_net_prefix) - TEST_NET="test_net-${RANDOM}${RANDOM}" -done - -docker pull "${IMAGE_TAG}" - -# Create the DUT container and connect to network. -DUT=$(docker create ${RUNTIME_ARG} --privileged --rm \ - --cap-add NET_ADMIN \ - --sysctl net.ipv6.conf.all.disable_ipv6=0 \ - --stop-timeout ${TIMEOUT} -it ${IMAGE_TAG}) -docker network connect "${CTRL_NET}" \ - --ip "${CTRL_NET_PREFIX}${DUT_NET_SUFFIX}" "${DUT}" \ - || (docker kill ${DUT}; docker rm ${DUT}; false) -docker network connect "${TEST_NET}" \ - --ip "${TEST_NET_PREFIX}${DUT_NET_SUFFIX}" "${DUT}" \ - || (docker kill ${DUT}; docker rm ${DUT}; false) -docker start "${DUT}" - -# Create the test bench container and connect to network. -TESTBENCH=$(docker create --privileged --rm \ - --cap-add NET_ADMIN \ - --sysctl net.ipv6.conf.all.disable_ipv6=0 \ - --stop-timeout ${TIMEOUT} -it ${IMAGE_TAG}) -docker network connect "${CTRL_NET}" \ - --ip "${CTRL_NET_PREFIX}${TESTBENCH_NET_SUFFIX}" "${TESTBENCH}" \ - || (docker kill ${TESTBENCH}; docker rm ${TESTBENCH}; false) -docker network connect "${TEST_NET}" \ - --ip "${TEST_NET_PREFIX}${TESTBENCH_NET_SUFFIX}" "${TESTBENCH}" \ - || (docker kill ${TESTBENCH}; docker rm ${TESTBENCH}; false) -docker start "${TESTBENCH}" - -# Start the posix_server in the DUT. -declare -r DOCKER_POSIX_SERVER_BINARY="/$(basename ${POSIX_SERVER_BINARY})" -docker cp -L ${POSIX_SERVER_BINARY} "${DUT}:${DOCKER_POSIX_SERVER_BINARY}" - -docker exec -t "${DUT}" \ - /bin/bash -c "${DOCKER_POSIX_SERVER_BINARY} \ - --ip ${CTRL_NET_PREFIX}${DUT_NET_SUFFIX} \ - --port ${CTRL_PORT}" & - -# Because the Linux kernel receives the SYN-ACK but didn't send the SYN it will -# issue a RST. To prevent this IPtables can be used to filter those out. -docker exec "${TESTBENCH}" \ - iptables -A INPUT -i ${TEST_DEVICE} -j DROP - -# Wait for the DUT server to come up. Attempt to connect to it from the test -# bench every 100 milliseconds until success. -while ! docker exec "${TESTBENCH}" \ - nc -zv "${CTRL_NET_PREFIX}${DUT_NET_SUFFIX}" "${CTRL_PORT}"; do - sleep 0.1 -done - -declare -r REMOTE_MAC=$(docker exec -t "${DUT}" ip link show \ - "${TEST_DEVICE}" | tail -1 | cut -d' ' -f6) -declare -r LOCAL_MAC=$(docker exec -t "${TESTBENCH}" ip link show \ - "${TEST_DEVICE}" | tail -1 | cut -d' ' -f6) -declare REMOTE_IPV6=$(docker exec -t "${DUT}" ip addr show scope link \ - "${TEST_DEVICE}" | grep inet6 | cut -d' ' -f6 | cut -d'/' -f1) -declare -r LOCAL_IPV6=$(docker exec -t "${TESTBENCH}" ip addr show scope link \ - "${TEST_DEVICE}" | grep inet6 | cut -d' ' -f6 | cut -d'/' -f1) - -# Netstack as DUT doesn't assign IPv6 addresses automatically so do it if -# needed. Convert the MAC address to an IPv6 link local address as described in -# RFC 4291 page 20: https://tools.ietf.org/html/rfc4291#page-20 -if [[ -z "${REMOTE_IPV6}" ]]; then - # Split the octets of the MAC into an array of strings. - IFS=":" read -a REMOTE_OCTETS <<< "${REMOTE_MAC}" - # Flip the global bit. - REMOTE_OCTETS[0]=$(printf '%x' "$((0x${REMOTE_OCTETS[0]} ^ 2))") - # Add the IPv6 address. - docker exec "${DUT}" \ - ip addr add $(printf 'fe80::%02x%02x:%02xff:fe%02x:%02x%02x/64' \ - "0x${REMOTE_OCTETS[0]}" "0x${REMOTE_OCTETS[1]}" "0x${REMOTE_OCTETS[2]}" \ - "0x${REMOTE_OCTETS[3]}" "0x${REMOTE_OCTETS[4]}" "0x${REMOTE_OCTETS[5]}") \ - scope link \ - dev "${TEST_DEVICE}" - # Re-extract the IPv6 address. - # TODO(eyalsoha): Add "scope link" below when netstack supports correctly - # creating link-local IPv6 addresses. - REMOTE_IPV6=$(docker exec -t "${DUT}" ip addr show \ - "${TEST_DEVICE}" | grep inet6 | cut -d' ' -f6 | cut -d'/' -f1) -fi - -declare -r DOCKER_TESTBENCH_BINARY="/$(basename ${TESTBENCH_BINARY})" -docker cp -L "${TESTBENCH_BINARY}" "${TESTBENCH}:${DOCKER_TESTBENCH_BINARY}" - -if [[ -z "${TSHARK-}" ]]; then - # Run tcpdump in the test bench unbuffered, without dns resolution, just on - # the interface with the test packets. - docker exec -t "${TESTBENCH}" \ - tcpdump -S -vvv -U -n -i "${TEST_DEVICE}" \ - net "${TEST_NET_PREFIX}/24" or \ - host "${REMOTE_IPV6}" or \ - host "${LOCAL_IPV6}" & -else - # Run tshark in the test bench unbuffered, without dns resolution, just on the - # interface with the test packets. - docker exec -t "${TESTBENCH}" \ - tshark -V -l -n -i "${TEST_DEVICE}" \ - -o tcp.check_checksum:TRUE \ - -o udp.check_checksum:TRUE \ - net "${TEST_NET_PREFIX}/24" or \ - host "${REMOTE_IPV6}" or \ - host "${LOCAL_IPV6}" & -fi - -# tcpdump and tshark take time to startup -sleep 3 - -# Start a packetimpact test on the test bench. The packetimpact test sends and -# receives packets and also sends POSIX socket commands to the posix_server to -# be executed on the DUT. -docker exec \ - -e XML_OUTPUT_FILE="/test.xml" \ - -e TEST_TARGET \ - -t "${TESTBENCH}" \ - /bin/bash -c "${DOCKER_TESTBENCH_BINARY} \ - ${EXTRA_TEST_ARGS[@]-} \ - --posix_server_ip=${CTRL_NET_PREFIX}${DUT_NET_SUFFIX} \ - --posix_server_port=${CTRL_PORT} \ - --remote_ipv4=${TEST_NET_PREFIX}${DUT_NET_SUFFIX} \ - --local_ipv4=${TEST_NET_PREFIX}${TESTBENCH_NET_SUFFIX} \ - --remote_ipv6=${REMOTE_IPV6} \ - --local_ipv6=${LOCAL_IPV6} \ - --remote_mac=${REMOTE_MAC} \ - --local_mac=${LOCAL_MAC} \ - --device=${TEST_DEVICE}" && true -declare -r TEST_RESULT="${?}" -if [[ -z "${EXPECT_FAILURE-}" && "${TEST_RESULT}" != 0 ]]; then - echo 'FAIL: This test was expected to pass.' - exit ${TEST_RESULT} -fi -if [[ ! -z "${EXPECT_FAILURE-}" && "${TEST_RESULT}" == 0 ]]; then - echo 'FAIL: This test was expected to fail but passed. Enable the test and' \ - 'mark the corresponding bug as fixed.' - exit 1 -fi -echo PASS: No errors. diff --git a/test/runner/runner.go b/test/runner/runner.go index 14c9cbc47..e4f04cd2a 100644 --- a/test/runner/runner.go +++ b/test/runner/runner.go @@ -341,11 +341,13 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) { } } - // Set environment variables that indicate we are - // running in gVisor with the given platform and network. + // Set environment variables that indicate we are running in gVisor with + // the given platform, network, and filesystem stack. + // TODO(gvisor.dev/issue/1487): Update this when the runner supports VFS2. platformVar := "TEST_ON_GVISOR" networkVar := "GVISOR_NETWORK" - env := append(os.Environ(), platformVar+"="+*platform, networkVar+"="+*network) + vfsVar := "GVISOR_VFS" + env := append(os.Environ(), platformVar+"="+*platform, networkVar+"="+*network, vfsVar+"=VFS1") // Remove env variables that cause the gunit binary to write output // files, since they will stomp on eachother, and on the output files diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index adf259bba..5acdb8438 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -835,10 +835,7 @@ cc_binary( cc_binary( name = "fpsig_fork_test", testonly = 1, - srcs = select_arch( - amd64 = ["fpsig_fork.cc"], - arm64 = [], - ), + srcs = ["fpsig_fork.cc"], linkstatic = 1, deps = [ gtest, @@ -3288,6 +3285,7 @@ cc_binary( "//test/util:capability_util", "//test/util:file_descriptor", "//test/util:fs_util", + "@com_google_absl//absl/time", gtest, "//test/util:temp_path", "//test/util:test_main", diff --git a/test/syscalls/linux/fpsig_fork.cc b/test/syscalls/linux/fpsig_fork.cc index a346f1f00..d08111cd3 100644 --- a/test/syscalls/linux/fpsig_fork.cc +++ b/test/syscalls/linux/fpsig_fork.cc @@ -27,9 +27,22 @@ namespace testing { namespace { +#ifdef __x86_64__ #define GET_XMM(__var, __xmm) \ asm volatile("movq %%" #__xmm ", %0" : "=r"(__var)) #define SET_XMM(__var, __xmm) asm volatile("movq %0, %%" #__xmm : : "r"(__var)) +#define GET_FP0(__var) GET_XMM(__var, xmm0) +#define SET_FP0(__var) SET_XMM(__var, xmm0) +#elif __aarch64__ +#define __stringify_1(x...) #x +#define __stringify(x...) __stringify_1(x) +#define GET_FPREG(var, regname) \ + asm volatile("str "__stringify(regname) ", %0" : "=m"(var)) +#define SET_FPREG(var, regname) \ + asm volatile("ldr "__stringify(regname) ", %0" : "=m"(var)) +#define GET_FP0(var) GET_FPREG(var, d0) +#define SET_FP0(var) GET_FPREG(var, d0) +#endif int parent, child; @@ -40,7 +53,10 @@ void sigusr1(int s, siginfo_t* siginfo, void* _uc) { TEST_CHECK_MSG(child >= 0, "fork failed"); uint64_t val = SIGUSR1; - SET_XMM(val, xmm0); + SET_FP0(val); + uint64_t got; + GET_FP0(got); + TEST_CHECK_MSG(val == got, "Basic FP check failed in sigusr1()"); } TEST(FPSigTest, Fork) { @@ -67,8 +83,9 @@ TEST(FPSigTest, Fork) { // be the one clobbered. uint64_t expected = 0xdeadbeeffacefeed; - SET_XMM(expected, xmm0); + SET_FP0(expected); +#ifdef __x86_64__ asm volatile( "movl %[killnr], %%eax;" "movl %[parent], %%edi;" @@ -81,9 +98,18 @@ TEST(FPSigTest, Fork) { : "rax", "rdi", "rsi", "rdx", // Clobbered by syscall. "rcx", "r11"); +#elif __aarch64__ + asm volatile( + "mov x8, %0\n" + "mov x0, %1\n" + "mov x1, %2\n" + "mov x2, %3\n" + "svc #0\n" ::"r"(__NR_tgkill), + "r"(parent), "r"(parent_tid), "r"(SIGUSR1)); +#endif uint64_t got; - GET_XMM(got, xmm0); + GET_FP0(got); if (getpid() == parent) { // Parent. int status; diff --git a/test/syscalls/linux/itimer.cc b/test/syscalls/linux/itimer.cc index dd981a278..e397d5f57 100644 --- a/test/syscalls/linux/itimer.cc +++ b/test/syscalls/linux/itimer.cc @@ -267,8 +267,19 @@ int TestSIGPROFFairness(absl::Duration sleep) { // Random save/restore is disabled as it introduces additional latency and // unpredictable distribution patterns. TEST(ItimerTest, DeliversSIGPROFToThreadsRoughlyFairlyActive_NoRandomSave) { - // TODO(b/143247272): CPU time accounting is inaccurate for the KVM platform. - SKIP_IF(GvisorPlatform() == Platform::kKVM); + // On the KVM and ptrace platforms, switches between sentry and application + // context are sometimes extremely slow, causing the itimer to send SIGPROF to + // a thread that either already has one pending or has had SIGPROF delivered, + // but hasn't handled it yet (and thus therefore still has SIGPROF masked). In + // either case, since itimer signals are group-directed, signal sending falls + // back to notifying the thread group leader. ItimerSignalTest() fails if "too + // many" signals are delivered to the thread group leader, so these tests are + // flaky on these platforms. + // + // TODO(b/143247272): Clarify why context switches are so slow on KVM. + const auto gvisor_platform = GvisorPlatform(); + SKIP_IF(gvisor_platform == Platform::kKVM || + gvisor_platform == Platform::kPtrace); pid_t child; int execve_errno; @@ -291,8 +302,10 @@ TEST(ItimerTest, DeliversSIGPROFToThreadsRoughlyFairlyActive_NoRandomSave) { // Random save/restore is disabled as it introduces additional latency and // unpredictable distribution patterns. TEST(ItimerTest, DeliversSIGPROFToThreadsRoughlyFairlyIdle_NoRandomSave) { - // TODO(b/143247272): CPU time accounting is inaccurate for the KVM platform. - SKIP_IF(GvisorPlatform() == Platform::kKVM); + // See comment in DeliversSIGPROFToThreadsRoughlyFairlyActive. + const auto gvisor_platform = GvisorPlatform(); + SKIP_IF(gvisor_platform == Platform::kKVM || + gvisor_platform == Platform::kPtrace); pid_t child; int execve_errno; diff --git a/test/syscalls/linux/socket.cc b/test/syscalls/linux/socket.cc index 703d594a2..e0a4d0985 100644 --- a/test/syscalls/linux/socket.cc +++ b/test/syscalls/linux/socket.cc @@ -61,10 +61,8 @@ TEST(SocketTest, ProtocolInet) { } } -TEST(SocketTest, UnixSocketFileMode) { - // TODO(gvisor.dev/issue/1624): Re-enable this test once VFS1 is deleted. It - // should pass in VFS2. - SKIP_IF(IsRunningOnGvisor()); +TEST(SocketTest, UnixSocketStat) { + SKIP_IF(IsRunningWithVFS1()); FileDescriptor bound = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_STREAM, PF_UNIX)); @@ -83,13 +81,18 @@ TEST(SocketTest, UnixSocketFileMode) { struct stat statbuf = {}; ASSERT_THAT(stat(addr.sun_path, &statbuf), SyscallSucceeds()); + + // Mode should be S_IFSOCK. EXPECT_EQ(statbuf.st_mode, S_IFSOCK | sock_perm & ~mask); + + // Timestamps should be equal and non-zero. + EXPECT_NE(statbuf.st_atime, 0); + EXPECT_EQ(statbuf.st_atime, statbuf.st_mtime); + EXPECT_EQ(statbuf.st_atime, statbuf.st_ctime); } TEST(SocketTest, UnixConnectNeedsWritePerm) { - // TODO(gvisor.dev/issue/1624): Re-enable this test once VFS1 is deleted. It - // should succeed in VFS2. - SKIP_IF(IsRunningOnGvisor()); + SKIP_IF(IsRunningWithVFS1()); FileDescriptor bound = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_STREAM, PF_UNIX)); @@ -121,10 +124,7 @@ using SocketOpenTest = ::testing::TestWithParam<int>; // UDS cannot be opened. TEST_P(SocketOpenTest, Unix) { // FIXME(b/142001530): Open incorrectly succeeds on gVisor. - // - // TODO(gvisor.dev/issue/1624): Re-enable this test once VFS1 is deleted. It - // should succeed in VFS2. - SKIP_IF(IsRunningOnGvisor()); + SKIP_IF(IsRunningWithVFS1()); FileDescriptor bound = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_STREAM, PF_UNIX)); diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc index 27779e47c..fa81845fd 100644 --- a/test/syscalls/linux/socket_ip_tcp_generic.cc +++ b/test/syscalls/linux/socket_ip_tcp_generic.cc @@ -876,6 +876,51 @@ TEST_P(TCPSocketPairTest, SetTCPUserTimeoutAboveZero) { EXPECT_EQ(get, kAbove); } +TEST_P(TCPSocketPairTest, SetTCPWindowClampBelowMinRcvBufConnectedSocket) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + // Discover minimum receive buf by setting a really low value + // for the receive buffer. + constexpr int kZero = 0; + EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVBUF, &kZero, + sizeof(kZero)), + SyscallSucceeds()); + + // Now retrieve the minimum value for SO_RCVBUF as the set above should + // have caused SO_RCVBUF for the socket to be set to the minimum. + int get = -1; + socklen_t get_len = sizeof(get); + ASSERT_THAT( + getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVBUF, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + int min_so_rcvbuf = get; + + { + // Setting TCP_WINDOW_CLAMP to zero for a connected socket is not permitted. + constexpr int kZero = 0; + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_WINDOW_CLAMP, + &kZero, sizeof(kZero)), + SyscallFailsWithErrno(EINVAL)); + + // Non-zero clamp values below MIN_SO_RCVBUF/2 should result in the clamp + // being set to MIN_SO_RCVBUF/2. + int below_half_min_so_rcvbuf = min_so_rcvbuf / 2 - 1; + EXPECT_THAT( + setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_WINDOW_CLAMP, + &below_half_min_so_rcvbuf, sizeof(below_half_min_so_rcvbuf)), + SyscallSucceeds()); + + int get = -1; + socklen_t get_len = sizeof(get); + + ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_WINDOW_CLAMP, + &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(min_so_rcvbuf / 2, get); + } +} + TEST_P(TCPSocketPairTest, TCPResetDuringClose_NoRandomSave) { DisableSave ds; // Too many syscalls. constexpr int kThreadCount = 1000; diff --git a/test/syscalls/linux/symlink.cc b/test/syscalls/linux/symlink.cc index 03ee1250d..a17ff62e9 100644 --- a/test/syscalls/linux/symlink.cc +++ b/test/syscalls/linux/symlink.cc @@ -20,6 +20,7 @@ #include <string> #include "gtest/gtest.h" +#include "absl/time/clock.h" #include "test/util/capability_util.h" #include "test/util/file_descriptor.h" #include "test/util/fs_util.h" @@ -272,6 +273,30 @@ TEST(SymlinkTest, ChmodSymlink) { EXPECT_EQ(FilePermission(newpath), 0777); } +// Test that following a symlink updates the atime on the symlink. +TEST(SymlinkTest, FollowUpdatesATime) { + const auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const std::string link = NewTempAbsPath(); + EXPECT_THAT(symlink(file.path().c_str(), link.c_str()), SyscallSucceeds()); + + // Lstat the symlink. + struct stat st_before_follow; + ASSERT_THAT(lstat(link.c_str(), &st_before_follow), SyscallSucceeds()); + + // Let the clock advance. + absl::SleepFor(absl::Seconds(1)); + + // Open the file via the symlink. + int fd; + ASSERT_THAT(fd = open(link.c_str(), O_RDWR, 0666), SyscallSucceeds()); + FileDescriptor fd_closer(fd); + + // Lstat the symlink again, and check that atime is updated. + struct stat st_after_follow; + ASSERT_THAT(lstat(link.c_str(), &st_after_follow), SyscallSucceeds()); + EXPECT_LT(st_before_follow.st_atime, st_after_follow.st_atime); +} + class ParamSymlinkTest : public ::testing::TestWithParam<std::string> {}; // Test that creating an existing symlink with creat will create the target. diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc index d9c1ac0e1..a4d2953e1 100644 --- a/test/syscalls/linux/tcp_socket.cc +++ b/test/syscalls/linux/tcp_socket.cc @@ -1313,7 +1313,7 @@ TEST_P(SimpleTcpSocketTest, SetTCPDeferAcceptNeg) { int get = -1; socklen_t get_len = sizeof(get); ASSERT_THAT( - getsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len), + getsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &get, &get_len), SyscallSucceedsWithValue(0)); EXPECT_EQ(get_len, sizeof(get)); EXPECT_EQ(get, 0); @@ -1326,7 +1326,7 @@ TEST_P(SimpleTcpSocketTest, GetTCPDeferAcceptDefault) { int get = -1; socklen_t get_len = sizeof(get); ASSERT_THAT( - getsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len), + getsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &get, &get_len), SyscallSucceedsWithValue(0)); EXPECT_EQ(get_len, sizeof(get)); EXPECT_EQ(get, 0); @@ -1378,6 +1378,187 @@ TEST_P(SimpleTcpSocketTest, TCPConnectSoRcvBufRace) { SyscallSucceedsWithValue(0)); } +TEST_P(SimpleTcpSocketTest, SetTCPSynCntLessThanOne) { + FileDescriptor s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + + int get = -1; + socklen_t get_len = sizeof(get); + ASSERT_THAT(getsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + int default_syn_cnt = get; + + { + // TCP_SYNCNT less than 1 should be rejected with an EINVAL. + constexpr int kZero = 0; + EXPECT_THAT( + setsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &kZero, sizeof(kZero)), + SyscallFailsWithErrno(EINVAL)); + + // TCP_SYNCNT less than 1 should be rejected with an EINVAL. + constexpr int kNeg = -1; + EXPECT_THAT( + setsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &kNeg, sizeof(kNeg)), + SyscallFailsWithErrno(EINVAL)); + + int get = -1; + socklen_t get_len = sizeof(get); + + ASSERT_THAT(getsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(default_syn_cnt, get); + } +} + +TEST_P(SimpleTcpSocketTest, GetTCPSynCntDefault) { + FileDescriptor s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + + int get = -1; + socklen_t get_len = sizeof(get); + constexpr int kDefaultSynCnt = 6; + + ASSERT_THAT(getsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kDefaultSynCnt); +} + +TEST_P(SimpleTcpSocketTest, SetTCPSynCntGreaterThanOne) { + FileDescriptor s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + constexpr int kTCPSynCnt = 20; + ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &kTCPSynCnt, + sizeof(kTCPSynCnt)), + SyscallSucceeds()); + + int get = -1; + socklen_t get_len = sizeof(get); + ASSERT_THAT(getsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &get, &get_len), + SyscallSucceeds()); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kTCPSynCnt); +} + +TEST_P(SimpleTcpSocketTest, SetTCPSynCntAboveMax) { + FileDescriptor s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + int get = -1; + socklen_t get_len = sizeof(get); + ASSERT_THAT(getsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + int default_syn_cnt = get; + { + constexpr int kTCPSynCnt = 256; + ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &kTCPSynCnt, + sizeof(kTCPSynCnt)), + SyscallFailsWithErrno(EINVAL)); + + int get = -1; + socklen_t get_len = sizeof(get); + ASSERT_THAT(getsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &get, &get_len), + SyscallSucceeds()); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, default_syn_cnt); + } +} + +TEST_P(SimpleTcpSocketTest, SetTCPWindowClampBelowMinRcvBuf) { + FileDescriptor s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + + // Discover minimum receive buf by setting a really low value + // for the receive buffer. + constexpr int kZero = 0; + EXPECT_THAT(setsockopt(s.get(), SOL_SOCKET, SO_RCVBUF, &kZero, sizeof(kZero)), + SyscallSucceeds()); + + // Now retrieve the minimum value for SO_RCVBUF as the set above should + // have caused SO_RCVBUF for the socket to be set to the minimum. + int get = -1; + socklen_t get_len = sizeof(get); + ASSERT_THAT(getsockopt(s.get(), SOL_SOCKET, SO_RCVBUF, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + int min_so_rcvbuf = get; + + { + // TCP_WINDOW_CLAMP less than min_so_rcvbuf/2 should be set to + // min_so_rcvbuf/2. + int below_half_min_rcvbuf = min_so_rcvbuf / 2 - 1; + EXPECT_THAT( + setsockopt(s.get(), IPPROTO_TCP, TCP_WINDOW_CLAMP, + &below_half_min_rcvbuf, sizeof(below_half_min_rcvbuf)), + SyscallSucceeds()); + + int get = -1; + socklen_t get_len = sizeof(get); + + ASSERT_THAT( + getsockopt(s.get(), IPPROTO_TCP, TCP_WINDOW_CLAMP, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(min_so_rcvbuf / 2, get); + } +} + +TEST_P(SimpleTcpSocketTest, SetTCPWindowClampZeroClosedSocket) { + FileDescriptor s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + constexpr int kZero = 0; + ASSERT_THAT( + setsockopt(s.get(), IPPROTO_TCP, TCP_WINDOW_CLAMP, &kZero, sizeof(kZero)), + SyscallSucceeds()); + + int get = -1; + socklen_t get_len = sizeof(get); + ASSERT_THAT( + getsockopt(s.get(), IPPROTO_TCP, TCP_WINDOW_CLAMP, &get, &get_len), + SyscallSucceeds()); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kZero); +} + +TEST_P(SimpleTcpSocketTest, SetTCPWindowClampAboveHalfMinRcvBuf) { + FileDescriptor s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + + // Discover minimum receive buf by setting a really low value + // for the receive buffer. + constexpr int kZero = 0; + EXPECT_THAT(setsockopt(s.get(), SOL_SOCKET, SO_RCVBUF, &kZero, sizeof(kZero)), + SyscallSucceeds()); + + // Now retrieve the minimum value for SO_RCVBUF as the set above should + // have caused SO_RCVBUF for the socket to be set to the minimum. + int get = -1; + socklen_t get_len = sizeof(get); + ASSERT_THAT(getsockopt(s.get(), SOL_SOCKET, SO_RCVBUF, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + int min_so_rcvbuf = get; + + { + int above_half_min_rcv_buf = min_so_rcvbuf / 2 + 1; + EXPECT_THAT( + setsockopt(s.get(), IPPROTO_TCP, TCP_WINDOW_CLAMP, + &above_half_min_rcv_buf, sizeof(above_half_min_rcv_buf)), + SyscallSucceeds()); + + int get = -1; + socklen_t get_len = sizeof(get); + + ASSERT_THAT( + getsockopt(s.get(), IPPROTO_TCP, TCP_WINDOW_CLAMP, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(above_half_min_rcv_buf, get); + } +} + INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest, ::testing::Values(AF_INET, AF_INET6)); diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc index 740c7986d..42521efef 100644 --- a/test/syscalls/linux/udp_socket_test_cases.cc +++ b/test/syscalls/linux/udp_socket_test_cases.cc @@ -17,6 +17,7 @@ #include <arpa/inet.h> #include <fcntl.h> #include <netinet/in.h> +#include <poll.h> #include <sys/ioctl.h> #include <sys/socket.h> #include <sys/types.h> @@ -673,6 +674,11 @@ TEST_P(UdpSocketTest, ZerolengthWriteAllowed) { char buf[3]; // Send zero length packet from s_ to t_. ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0)); + + struct pollfd pfd = {t_, POLLIN, 0}; + ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000), + SyscallSucceedsWithValue(1)); + // Receive the packet. char received[3]; EXPECT_THAT(read(t_, received, sizeof(received)), @@ -698,6 +704,11 @@ TEST_P(UdpSocketTest, ZerolengthWriteAllowedNonBlockRead) { char buf[3]; // Send zero length packet from s_ to t_. ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0)); + + struct pollfd pfd = {t_, POLLIN, 0}; + ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000), + SyscallSucceedsWithValue(1)); + // Receive the packet. char received[3]; EXPECT_THAT(read(t_, received, sizeof(received)), @@ -859,6 +870,10 @@ TEST_P(UdpSocketTest, ReadShutdownNonblockPendingData) { EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds()); + struct pollfd pfd = {s_, POLLIN, 0}; + ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000), + SyscallSucceedsWithValue(1)); + // We should get the data even though read has been shutdown. EXPECT_THAT(recv(s_, received, 2, 0), SyscallSucceedsWithValue(2)); @@ -1112,6 +1127,10 @@ TEST_P(UdpSocketTest, FIONREADWriteShutdown) { ASSERT_THAT(send(s_, str, sizeof(str), 0), SyscallSucceedsWithValue(sizeof(str))); + struct pollfd pfd = {s_, POLLIN, 0}; + ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000), + SyscallSucceedsWithValue(1)); + n = -1; EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0)); EXPECT_EQ(n, sizeof(str)); @@ -1123,6 +1142,8 @@ TEST_P(UdpSocketTest, FIONREADWriteShutdown) { EXPECT_EQ(n, sizeof(str)); } +// NOTE: Do not use `FIONREAD` as test name because it will be replaced by the +// corresponding macro and become `0x541B`. TEST_P(UdpSocketTest, Fionread) { // Bind s_ to loopback:TestPort. ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); @@ -1138,10 +1159,14 @@ TEST_P(UdpSocketTest, Fionread) { char buf[3 * psize]; RandomizeBuffer(buf, sizeof(buf)); + struct pollfd pfd = {s_, POLLIN, 0}; for (int i = 0; i < 3; ++i) { ASSERT_THAT(sendto(t_, buf + i * psize, psize, 0, addr_[0], addrlen_), SyscallSucceedsWithValue(psize)); + ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000), + SyscallSucceedsWithValue(1)); + // Check that regardless of how many packets are in the queue, the size // reported is that of a single packet. n = -1; @@ -1165,10 +1190,18 @@ TEST_P(UdpSocketTest, FIONREADZeroLengthPacket) { char buf[3 * psize]; RandomizeBuffer(buf, sizeof(buf)); + struct pollfd pfd = {s_, POLLIN, 0}; for (int i = 0; i < 3; ++i) { ASSERT_THAT(sendto(t_, buf + i * psize, 0, 0, addr_[0], addrlen_), SyscallSucceedsWithValue(0)); + // TODO(gvisor.dev/issue/2726): sending a zero-length message to a hostinet + // socket does not cause a poll event to be triggered. + if (!IsRunningWithHostinet()) { + ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000), + SyscallSucceedsWithValue(1)); + } + // Check that regardless of how many packets are in the queue, the size // reported is that of a single packet. n = -1; @@ -1235,6 +1268,10 @@ TEST_P(UdpSocketTest, SoTimestamp) { // Send zero length packet from t_ to s_. ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0)); + struct pollfd pfd = {s_, POLLIN, 0}; + ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000), + SyscallSucceedsWithValue(1)); + char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))]; msghdr msg; memset(&msg, 0, sizeof(msg)); @@ -1278,6 +1315,10 @@ TEST_P(UdpSocketTest, TimestampIoctl) { ASSERT_THAT(RetryEINTR(write)(t_, buf, sizeof(buf)), SyscallSucceedsWithValue(sizeof(buf))); + struct pollfd pfd = {s_, POLLIN, 0}; + ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000), + SyscallSucceedsWithValue(1)); + // There should be no control messages. char recv_buf[sizeof(buf)]; ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf))); @@ -1315,6 +1356,10 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) { SyscallSucceedsWithValue(sizeof(buf))); ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0)); + struct pollfd pfd = {s_, POLLIN, 0}; + ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000), + SyscallSucceedsWithValue(1)); + // There should be no control messages. char recv_buf[sizeof(buf)]; ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf))); @@ -1330,6 +1375,9 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) { SyscallSucceeds()); ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0)); + ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000), + SyscallSucceedsWithValue(1)); + // There should be a message for SO_TIMESTAMP. char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))]; msghdr msg = {}; diff --git a/test/syscalls/linux/utimes.cc b/test/syscalls/linux/utimes.cc index 22e6d1a85..e647d2896 100644 --- a/test/syscalls/linux/utimes.cc +++ b/test/syscalls/linux/utimes.cc @@ -48,12 +48,15 @@ void TimeBoxed(absl::Time* before, absl::Time* after, // filesystems set it to 1, so we don't do any truncation. struct timespec ts; EXPECT_THAT(clock_gettime(CLOCK_REALTIME_COARSE, &ts), SyscallSucceeds()); - *before = absl::TimeFromTimespec(ts); + // FIXME(b/132819225): gVisor filesystem timestamps inconsistently use the + // internal or host clock, which may diverge slightly. Allow some slack on + // times to account for the difference. + *before = absl::TimeFromTimespec(ts) - absl::Seconds(1); fn(); EXPECT_THAT(clock_gettime(CLOCK_REALTIME_COARSE, &ts), SyscallSucceeds()); - *after = absl::TimeFromTimespec(ts); + *after = absl::TimeFromTimespec(ts) + absl::Seconds(1); if (*after < *before) { // Clock jumped backwards; retry. @@ -68,11 +71,11 @@ void TimeBoxed(absl::Time* before, absl::Time* after, void TestUtimesOnPath(std::string const& path) { struct stat statbuf; - struct timeval times[2] = {{1, 0}, {2, 0}}; + struct timeval times[2] = {{10, 0}, {20, 0}}; EXPECT_THAT(utimes(path.c_str(), times), SyscallSucceeds()); EXPECT_THAT(stat(path.c_str(), &statbuf), SyscallSucceeds()); - EXPECT_EQ(1, statbuf.st_atime); - EXPECT_EQ(2, statbuf.st_mtime); + EXPECT_EQ(10, statbuf.st_atime); + EXPECT_EQ(20, statbuf.st_mtime); absl::Time before; absl::Time after; @@ -103,18 +106,18 @@ TEST(UtimesTest, OnDir) { TEST(UtimesTest, MissingPath) { auto path = NewTempAbsPath(); - struct timeval times[2] = {{1, 0}, {2, 0}}; + struct timeval times[2] = {{10, 0}, {20, 0}}; EXPECT_THAT(utimes(path.c_str(), times), SyscallFailsWithErrno(ENOENT)); } void TestFutimesat(int dirFd, std::string const& path) { struct stat statbuf; - struct timeval times[2] = {{1, 0}, {2, 0}}; + struct timeval times[2] = {{10, 0}, {20, 0}}; EXPECT_THAT(futimesat(dirFd, path.c_str(), times), SyscallSucceeds()); EXPECT_THAT(fstatat(dirFd, path.c_str(), &statbuf, 0), SyscallSucceeds()); - EXPECT_EQ(1, statbuf.st_atime); - EXPECT_EQ(2, statbuf.st_mtime); + EXPECT_EQ(10, statbuf.st_atime); + EXPECT_EQ(20, statbuf.st_mtime); absl::Time before; absl::Time after; @@ -175,11 +178,11 @@ TEST(FutimesatTest, InvalidNsec) { void TestUtimensat(int dirFd, std::string const& path) { struct stat statbuf; - const struct timespec times[2] = {{1, 0}, {2, 0}}; + const struct timespec times[2] = {{10, 0}, {20, 0}}; EXPECT_THAT(utimensat(dirFd, path.c_str(), times, 0), SyscallSucceeds()); EXPECT_THAT(fstatat(dirFd, path.c_str(), &statbuf, 0), SyscallSucceeds()); - EXPECT_EQ(1, statbuf.st_atime); - EXPECT_EQ(2, statbuf.st_mtime); + EXPECT_EQ(10, statbuf.st_atime); + EXPECT_EQ(20, statbuf.st_mtime); // Test setting with UTIME_NOW and UTIME_OMIT. struct stat statbuf2; @@ -301,13 +304,13 @@ TEST(Utimensat, NullPath) { auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDWR)); struct stat statbuf; - const struct timespec times[2] = {{1, 0}, {2, 0}}; + const struct timespec times[2] = {{10, 0}, {20, 0}}; // Call syscall directly. EXPECT_THAT(syscall(SYS_utimensat, fd.get(), NULL, times, 0), SyscallSucceeds()); EXPECT_THAT(fstatat(0, f.path().c_str(), &statbuf, 0), SyscallSucceeds()); - EXPECT_EQ(1, statbuf.st_atime); - EXPECT_EQ(2, statbuf.st_mtime); + EXPECT_EQ(10, statbuf.st_atime); + EXPECT_EQ(20, statbuf.st_mtime); } } // namespace diff --git a/test/util/test_util.cc b/test/util/test_util.cc index 95e1e0c96..b20758626 100644 --- a/test/util/test_util.cc +++ b/test/util/test_util.cc @@ -42,12 +42,13 @@ namespace testing { #define TEST_ON_GVISOR "TEST_ON_GVISOR" #define GVISOR_NETWORK "GVISOR_NETWORK" +#define GVISOR_VFS "GVISOR_VFS" bool IsRunningOnGvisor() { return GvisorPlatform() != Platform::kNative; } const std::string GvisorPlatform() { // Set by runner.go. - char* env = getenv(TEST_ON_GVISOR); + const char* env = getenv(TEST_ON_GVISOR); if (!env) { return Platform::kNative; } @@ -55,10 +56,19 @@ const std::string GvisorPlatform() { } bool IsRunningWithHostinet() { - char* env = getenv(GVISOR_NETWORK); + const char* env = getenv(GVISOR_NETWORK); return env && strcmp(env, "host") == 0; } +bool IsRunningWithVFS1() { + const char* env = getenv(GVISOR_VFS); + if (env == nullptr) { + // If not set, it's running on Linux. + return false; + } + return strcmp(env, "VFS1") == 0; +} + // Inline cpuid instruction. Preserve %ebx/%rbx register. In PIC compilations // %ebx contains the address of the global offset table. %rbx is occasionally // used to address stack variables in presence of dynamic allocas. diff --git a/test/util/test_util.h b/test/util/test_util.h index c5cb9d6d6..8e3245b27 100644 --- a/test/util/test_util.h +++ b/test/util/test_util.h @@ -220,6 +220,7 @@ constexpr char kKVM[] = "kvm"; bool IsRunningOnGvisor(); const std::string GvisorPlatform(); bool IsRunningWithHostinet(); +bool IsRunningWithVFS1(); #ifdef __linux__ void SetupGvisorDeathTest(); diff --git a/tools/bazel.mk b/tools/bazel.mk index 4d9bbf0ee..7cb6e393b 100644 --- a/tools/bazel.mk +++ b/tools/bazel.mk @@ -59,6 +59,8 @@ SHELL=/bin/bash -o pipefail ## DOCKER_SOCKET - The Docker socket (default: detected). ## bazel-server-start: load-default ## Starts the bazel server. + @mkdir -p $(BAZEL_CACHE) + @mkdir -p $(GCLOUD_CONFIG) docker run -d --rm \ --init \ --name $(DOCKER_NAME) \ diff --git a/tools/go_branch.sh b/tools/go_branch.sh index f97a74aaf..e568a0a76 100755 --- a/tools/go_branch.sh +++ b/tools/go_branch.sh @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -set -eo pipefail +set -xeo pipefail # Discovery the package name from the go.mod file. declare -r module=$(cat go.mod | grep -E "^module" | cut -d' ' -f2) @@ -42,7 +42,8 @@ declare -r head=$(git describe --always) # We expect to have an existing go branch that we will use as the basis for # this commit. That branch may be empty, but it must exist. -declare -r go_branch=$(git show-ref --hash origin/go) +git fetch --all +declare -r go_branch=$(git show-ref --hash go) # Clone the current repository to the temporary directory, and check out the # current go_branch directory. We move to the new repository for convenience. diff --git a/tools/go_generics/generics.go b/tools/go_generics/generics.go index e9cc2c753..0860ca9db 100644 --- a/tools/go_generics/generics.go +++ b/tools/go_generics/generics.go @@ -223,7 +223,9 @@ func main() { } else { switch kind { case globals.KindType, globals.KindVar, globals.KindConst, globals.KindFunction: - ident.Name = *prefix + ident.Name + *suffix + if ident.Name != "_" { + ident.Name = *prefix + ident.Name + *suffix + } case globals.KindTag: // Modify the state tag appropriately. if m := stateTagRegexp.FindStringSubmatch(ident.Name); m != nil { diff --git a/tools/issue_reviver/main.go b/tools/issue_reviver/main.go index 4256f5a6c..47c796b8a 100644 --- a/tools/issue_reviver/main.go +++ b/tools/issue_reviver/main.go @@ -20,6 +20,7 @@ import ( "fmt" "io/ioutil" "os" + "strings" "gvisor.dev/gvisor/tools/issue_reviver/github" "gvisor.dev/gvisor/tools/issue_reviver/reviver" @@ -35,14 +36,22 @@ var ( // Keep the options simple for now. Supports only a single path and repo. func init() { - flag.StringVar(&owner, "owner", "google", "Github project org/owner to look for issues") - flag.StringVar(&repo, "repo", "gvisor", "Github repo to look for issues") + flag.StringVar(&owner, "owner", "", "Github project org/owner to look for issues") + flag.StringVar(&repo, "repo", "", "Github repo to look for issues") flag.StringVar(&tokenFile, "oauth-token-file", "", "Path to file containing the OAUTH token to be used as credential to github") - flag.StringVar(&path, "path", "", "Path to scan for TODOs") + flag.StringVar(&path, "path", ".", "Path to scan for TODOs") flag.BoolVar(&dryRun, "dry-run", false, "If set to true, no changes are made to issues") } func main() { + // Set defaults from the environment. + repository := os.Getenv("GITHUB_REPOSITORY") + if parts := strings.SplitN(repository, "/", 2); len(parts) == 2 { + owner = parts[0] + repo = parts[1] + } + + // Parse flags. flag.Parse() // Check for mandatory parameters. @@ -62,8 +71,10 @@ func main() { os.Exit(1) } - // Token is passed as a file so it doesn't show up in command line arguments. - var token string + // The access token may be passed as a file so it doesn't show up in + // command line arguments. It also may be provided through the + // environment to faciliate use through GitHub's CI system. + token := os.Getenv("GITHUB_TOKEN") if len(tokenFile) != 0 { bytes, err := ioutil.ReadFile(tokenFile) if err != nil { diff --git a/tools/vm/build.sh b/tools/vm/build.sh index 5d3dc0bbf..752b2b77b 100755 --- a/tools/vm/build.sh +++ b/tools/vm/build.sh @@ -64,14 +64,14 @@ function cleanup { trap cleanup EXIT # Wait for the instance to become available (up to 5 minutes). -echo -n "Waiting for ${INSTANCE_NAME}" +echo -n "Waiting for ${INSTANCE_NAME}" >&2 declare timeout=300 declare success=0 declare internal="" declare -r start=$(date +%s) declare -r end=$((${start}+${timeout})) while [[ "$(date +%s)" -lt "${end}" ]] && [[ "${success}" -lt 3 ]]; do - echo -n "." + echo -n "." >&2 if gcloud compute ssh --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- true 2>/dev/null; then success=$((${success}+1)) elif gcloud compute ssh --internal-ip --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- true 2>/dev/null; then @@ -81,10 +81,10 @@ while [[ "$(date +%s)" -lt "${end}" ]] && [[ "${success}" -lt 3 ]]; do done if [[ "${success}" -eq "0" ]]; then - echo "connect timed out after ${timeout} seconds." + echo "connect timed out after ${timeout} seconds." >&2 exit 1 else - echo "done." + echo "done." >&2 fi # Run the install scripts provided. diff --git a/tools/vm/defs.bzl b/tools/vm/defs.bzl index 61feefcbc..0f67cfa92 100644 --- a/tools/vm/defs.bzl +++ b/tools/vm/defs.bzl @@ -60,7 +60,7 @@ def _vm_image_impl(ctx): # Run the builder to generate our output. echo = ctx.actions.declare_file(ctx.label.name) resolved_inputs, argv, runfiles_manifests = ctx.resolve_command( - command = "echo -ne \"#!/bin/bash\\necho $(%s)\\n\" > %s && chmod 0755 %s" % ( + command = "echo -ne \"#!/bin/bash\\nset -e\\nimage=$(%s)\\necho ${image}\\n\" > %s && chmod 0755 %s" % ( ctx.files.builder[0].path, echo.path, echo.path, diff --git a/tools/vm/ubuntu1604/10_core.sh b/tools/vm/ubuntu1604/10_core.sh index cd518d6ac..629f7cf7a 100755 --- a/tools/vm/ubuntu1604/10_core.sh +++ b/tools/vm/ubuntu1604/10_core.sh @@ -40,4 +40,4 @@ if ! [[ -d /usr/local/go ]]; then fi # Link the Go binary from /usr/bin; replacing anything there. -(cd /usr/bin && rm -f go && sudo ln -fs /usr/local/go/bin/go go) +(cd /usr/bin && rm -f go && ln -fs /usr/local/go/bin/go go) diff --git a/tools/vm/ubuntu1604/15_gcloud.sh b/tools/vm/ubuntu1604/15_gcloud.sh new file mode 100755 index 000000000..bc2e5eccc --- /dev/null +++ b/tools/vm/ubuntu1604/15_gcloud.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +# Copyright 2019 The gVisor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -xeo pipefail + +# Install all essential build tools. +while true; do + if (apt-get update && apt-get install -y \ + apt-transport-https \ + ca-certificates \ + gnupg); then + break + fi + result=$? + if [[ $result -ne 100 ]]; then + exit $result + fi +done + +# Add gcloud repositories. +echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | \ + tee -a /etc/apt/sources.list.d/google-cloud-sdk.list + +# Add the appropriate key. +curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | \ + apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - + +# Install the gcloud SDK. +while true; do + if (apt-get update && apt-get install -y google-cloud-sdk); then + break + fi + result=$? + if [[ $result -ne 100 ]]; then + exit $result + fi +done diff --git a/tools/vm/ubuntu1604/40_kokoro.sh b/tools/vm/ubuntu1604/40_kokoro.sh index 06a1e6c48..2974f156c 100755 --- a/tools/vm/ubuntu1604/40_kokoro.sh +++ b/tools/vm/ubuntu1604/40_kokoro.sh @@ -43,14 +43,14 @@ done # junitparser is used to merge junit xml files. pip install junitparser -# We need a kbuilder user. -if useradd -c "kbuilder user" -m -s /bin/bash kbuilder; then - # User was added successfully; we add the relevant SSH keys here. - mkdir -p ~kbuilder/.ssh - (IFS=$'\n'; echo "${ssh_public_keys[*]}") > ~kbuilder/.ssh/authorized_keys - chmod 0600 ~kbuilder/.ssh/authorized_keys - chown -R kbuilder ~kbuilder/.ssh -fi +# We need a kbuilder user, which may already exist. +useradd -c "kbuilder user" -m -s /bin/bash kbuilder || true + +# We need to provision appropriate keys. +mkdir -p ~kbuilder/.ssh +(IFS=$'\n'; echo "${ssh_public_keys[*]}") > ~kbuilder/.ssh/authorized_keys +chmod 0600 ~kbuilder/.ssh/authorized_keys +chown -R kbuilder ~kbuilder/.ssh # Give passwordless sudo access. cat > /etc/sudoers.d/kokoro <<EOF diff --git a/website/BUILD b/website/BUILD index d6afd5f44..c97b2560b 100644 --- a/website/BUILD +++ b/website/BUILD @@ -138,7 +138,6 @@ docs( "//g3doc:community", "//g3doc:index", "//g3doc:roadmap", - "//g3doc/architecture_guide:index", "//g3doc/architecture_guide:performance", "//g3doc/architecture_guide:platforms", "//g3doc/architecture_guide:resources", diff --git a/website/_includes/footer.html b/website/_includes/footer.html index 5d9267f35..9cc8176f7 100644 --- a/website/_includes/footer.html +++ b/website/_includes/footer.html @@ -2,9 +2,9 @@ {% include footer-links.html %} </footer> +<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.3.1/jquery.min.js" integrity="sha256-FgpCb/KJQlLNfOu91ta32o/NMZxltwRo8QtmkMRdAu8=" crossorigin="anonymous"></script> <script src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.10.1/js/all.min.js" integrity="sha256-Z1Nvg/+y2+vRFhFgFij7Lv0r77yG3hOvWz2wI0SfTa0=" crossorigin="anonymous"></script> <script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha256-U5ZEeKfGNOja007MMD3YBI0A3OSZOQbeG6z2f2Y0hu8=" crossorigin="anonymous"></script> -<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.3.1/jquery.min.js" integrity="sha256-FgpCb/KJQlLNfOu91ta32o/NMZxltwRo8QtmkMRdAu8=" crossorigin="anonymous"></script> <script src="https://cdnjs.cloudflare.com/ajax/libs/d3/4.13.0/d3.min.js" integrity="sha256-hYXbQJK4qdJiAeDVjjQ9G0D6A0xLnDQ4eJI9dkm7Fpk=" crossorigin="anonymous"></script> {% if site.analytics %} diff --git a/website/_layouts/docs.html b/website/_layouts/docs.html index e11492915..549305089 100644 --- a/website/_layouts/docs.html +++ b/website/_layouts/docs.html @@ -47,11 +47,13 @@ categories: <h1>{{ page.title }}</h1> {% if page.editpath %} <p> - <a href="https://github.com/google/gvisor/edit/master/content/{{page.editpath}}" target="_blank"><i class="fa fa-edit fa-fw"></i> Edit this page</a> + <a href="https://github.com/google/gvisor/edit/master/{{page.editpath}}" target="_blank"><i class="fa fa-edit fa-fw"></i> Edit this page</a> <a href="https://github.com/google/gvisor/issues/new?title={{page.title | url_encode}}" target="_blank"><i class="fab fa-github fa-fw"></i> Create issue</a> </p> {% endif %} + <div class="docs-content"> {{ content }} + </div> </div> </div> </div> diff --git a/website/_sass/front.scss b/website/_sass/front.scss index 44a7e3473..0e4208f3c 100644 --- a/website/_sass/front.scss +++ b/website/_sass/front.scss @@ -4,12 +4,14 @@ background-repeat: no-repeat; background-size: cover; background-blend-mode: darken; - background-color: rgba(0, 0, 0, 0.1); + background-color: rgba(0, 0, 0, 0.3); p { color: #fff; margin-top: 0; margin-bottom: 0; font-weight: 300; + font-size: 24px; + line-height: 30px; } } diff --git a/website/_sass/style.scss b/website/_sass/style.scss index 520ea469a..4deb945d4 100644 --- a/website/_sass/style.scss +++ b/website/_sass/style.scss @@ -142,3 +142,13 @@ table th { margin-top: 10px; margin-bottom: 20px; } + +.docs-content * img { + display: block; + margin: 20px auto; +} + +.blog-content * img { + display: block; + margin: 20px auto; +} diff --git a/website/blog/2019-11-18-security-basics.md b/website/blog/2019-11-18-security-basics.md index ed6d97ffe..fbdd511dd 100644 --- a/website/blog/2019-11-18-security-basics.md +++ b/website/blog/2019-11-18-security-basics.md @@ -56,15 +56,9 @@ in combination: redundant walls, scattered draw bridges, small bottle-neck entrances, moats, etc. A simplified version of the design is below -([more detailed version](/docs/architecture_guide/))[^2]: +([more detailed version](/docs/))[^2]: --------------------------------------------------------------------------------- - -![Figure 1](/assets/images/2019-11-18-security-basics-figure1.png) - -Figure 1: Simplified design of gVisor. - --------------------------------------------------------------------------------- +![Figure 1](/assets/images/2019-11-18-security-basics-figure1.png "Simplified design of gVisor.") In order to discuss design principles, the following components are important to know: @@ -134,13 +128,7 @@ minimum level of permission is required for it to perform its function. Specifically, the closer you are to the untrusted application, the less privilege you have. --------------------------------------------------------------------------------- - -![Figure 2](/assets/images/2019-11-18-security-basics-figure2.png) - -Figure 2: runsc components and their privileges. - --------------------------------------------------------------------------------- +![Figure 2](/assets/images/2019-11-18-security-basics-figure2.png "runsc components and their privileges.") This is evident in how runsc (the drop in gVisor binary for Docker/Kubernetes) constructs the sandbox. The Sentry has the least privilege possible (it can't @@ -222,15 +210,7 @@ the host Linux syscalls. In other words, with gVisor, applications get the vast majority (and growing) functionality of Linux containers for only 68 possible syscalls to the Host OS. 350 syscalls to 68 is attack surface reduction. --------------------------------------------------------------------------------- - -![Figure 3](/assets/images/2019-11-18-security-basics-figure3.png) - -Figure 3: Reduction of Attack Surface of the Syscall Table. Note that the -Senty's Syscall Emulation Layer keeps the Containerized Process from ever -calling the Host OS. - --------------------------------------------------------------------------------- +![Figure 3](/assets/images/2019-11-18-security-basics-figure3.png "Reduction of Attack Surface of the Syscall Table. Note that the Senty's Syscall Emulation Layer keeps the Containerized Process from ever calling the Host OS.") ## Secure-by-default diff --git a/website/blog/2020-04-02-networking-security.md b/website/blog/2020-04-02-networking-security.md index 78f0a6714..5a5e38fd7 100644 --- a/website/blog/2020-04-02-networking-security.md +++ b/website/blog/2020-04-02-networking-security.md @@ -69,13 +69,7 @@ a similar syscall). Moreover, because packets typically come from off-host (e.g. the internet), the Host OS's packet processing code has received a lot of scrutiny, hopefully resulting in a high degree of hardening. --------------------------------------------------------------------------------- - -![Figure 1](/assets/images/2020-04-02-networking-security-figure1.png) - -Figure 1: Netstack and gVisor - --------------------------------------------------------------------------------- +![Figure 1](/assets/images/2020-04-02-networking-security-figure1.png "Network and gVisor.") ## Writing a network stack diff --git a/website/cmd/syscalldocs/main.go b/website/cmd/syscalldocs/main.go index 62d293a05..327537214 100644 --- a/website/cmd/syscalldocs/main.go +++ b/website/cmd/syscalldocs/main.go @@ -46,7 +46,7 @@ type SyscallDoc struct { } var mdTemplate = template.Must(template.New("out").Parse(`--- -title: {{.OS}}/{{.Arch}} +title: {{.Title}} description: Syscall Compatibility Reference Documentation for {{.OS}}/{{.Arch}} layout: docs category: Compatibility @@ -134,6 +134,7 @@ func main() { weight += 10 data := struct { + Title string OS string Arch string Weight int @@ -149,7 +150,8 @@ func main() { URLs []string } }{ - OS: strings.Title(osName), + Title: strings.Title(osName) + "/" + archName, + OS: osName, Arch: archName, Weight: weight, Total: 0, diff --git a/website/index.md b/website/index.md index 34d3ee23d..84f877d49 100644 --- a/website/index.md +++ b/website/index.md @@ -3,10 +3,10 @@ <div class="row"> <div class="col-md-3"></div> <div class="col-md-6"> - <p>gVisor is an <b>application kernel</b> and <b>container runtime</b> providing defense-in-depth for containers <em>anywhere</em>.</p> + <p>gVisor is an <b>application kernel</b> for <b>containers</b> that provides efficient defense-in-depth anywhere.</p> <p style="margin-top: 20px;"> + <a class="btn" href="/docs/user_guide/quick_start/docker/">Quick start <i class="fas fa-arrow-alt-circle-right ml-2"></i></a> <a class="btn" href="/docs/">Learn More <i class="fas fa-arrow-alt-circle-right ml-2"></i></a> - <a class="btn btn-inverse" href="https://github.com/google/gvisor">GitHub <i class="fab fa-github ml-2"></i></a> </p> </div> <div class="col-md-3"></div> @@ -19,8 +19,8 @@ <div class="row"> <div class="col-md-4"> <h4 id="seamless-security">Container-native Security <i class="fas fa-lock"></i></h4> - <p>By providing each container with its own userspace kernel, gVisor limits - the attack surface of the host. This protection does not limit + <p>By providing each container with its own application kernel, gVisor + limits the attack surface of the host. This protection does not limit functionality: gVisor runs unmodified binaries and integrates with container orchestration systems, such as Docker and Kubernetes, and supports features such as volumes and sidecars.</p> |