From 508e25b6d6e9a81edb6ddf8738450b79898b446a Mon Sep 17 00:00:00 2001
From: Adin Scannell
Date: Mon, 27 Apr 2020 22:24:58 -0700
Subject: Adapt website to use g3doc sources and bazel.

This adapts the merged website repository to use the image and bazel
build framework.

It explicitly avoids the container_image rules provided by bazel, opting
instead to build with direct docker commands when necessary. The
relevant build commands are incorporated into the top-level Makefile.
---
 g3doc/architecture_guide/BUILD                      |  64 ++++++
 g3doc/architecture_guide/Layers.png                 | Bin 0 -> 11044 bytes
 g3doc/architecture_guide/Layers.svg                 |   1 +
 .../architecture_guide/Machine-Virtualization.png   | Bin 0 -> 13205 bytes
 .../architecture_guide/Machine-Virtualization.svg   |   1 +
 g3doc/architecture_guide/README.md                  |  80 +++++++
 g3doc/architecture_guide/Rule-Based-Execution.png   | Bin 0 -> 6780 bytes
 g3doc/architecture_guide/Rule-Based-Execution.svg   |   1 +
 g3doc/architecture_guide/Sentry-Gofer.png           | Bin 0 -> 9064 bytes
 g3doc/architecture_guide/Sentry-Gofer.svg           |   1 +
 g3doc/architecture_guide/performance.md             | 252 +++++++++++++++++++++
 g3doc/architecture_guide/platforms.md               |  86 +++++++
 g3doc/architecture_guide/resources.md               |   1 +
 g3doc/architecture_guide/security.md                | 251 ++++++++++++++++++++
 14 files changed, 738 insertions(+)
 create mode 100644 g3doc/architecture_guide/BUILD
 create mode 100644 g3doc/architecture_guide/Layers.png
 create mode 100644 g3doc/architecture_guide/Layers.svg
 create mode 100644 g3doc/architecture_guide/Machine-Virtualization.png
 create mode 100644 g3doc/architecture_guide/Machine-Virtualization.svg
 create mode 100644 g3doc/architecture_guide/README.md
 create mode 100644 g3doc/architecture_guide/Rule-Based-Execution.png
 create mode 100644 g3doc/architecture_guide/Rule-Based-Execution.svg
 create mode 100644 g3doc/architecture_guide/Sentry-Gofer.png
 create mode 100644 g3doc/architecture_guide/Sentry-Gofer.svg
 create mode 100644 g3doc/architecture_guide/performance.md
 create mode 100644 g3doc/architecture_guide/platforms.md
 create mode 100644 g3doc/architecture_guide/resources.md
 create mode 100644 g3doc/architecture_guide/security.md

diff --git a/g3doc/architecture_guide/BUILD b/g3doc/architecture_guide/BUILD
new file mode 100644
index 000000000..72038305b
--- /dev/null
+++ b/g3doc/architecture_guide/BUILD
@@ -0,0 +1,64 @@
load("//website:defs.bzl", "doc")

package(
    default_visibility = ["//website:__pkg__"],
    licenses = ["notice"],
)

doc(
    name = "index",
    src = "README.md",
    category = "Architecture Guide",
    data = [
        "Layers.png",
        "Layers.svg",
        "Machine-Virtualization.png",
        "Machine-Virtualization.svg",
        "Rule-Based-Execution.png",
        "Rule-Based-Execution.svg",
        "Sentry-Gofer.png",
        "Sentry-Gofer.svg",
    ],
    permalink = "/docs/architecture_guide/",
    weight = "0",
)

doc(
    name = "platforms",
    src = "platforms.md",
    category = "Architecture Guide",
    data = [
        "Sentry-Gofer.png",
        "Sentry-Gofer.svg",
    ],
    permalink = "/docs/architecture_guide/platforms/",
    weight = "40",
)

doc(
    name = "resources",
    src = "resources.md",
    category = "Architecture Guide",
    permalink = "/docs/architecture_guide/resources/",
    weight = "30",
)

doc(
    name = "security",
    src = "security.md",
    category = "Architecture Guide",
    data = [
        "Layers.png",
        "Layers.svg",
    ],
    permalink = "/docs/architecture_guide/security/",
    weight = "10",
)

doc(
    name = "performance",
    src = "performance.md",
    category = "Architecture Guide",
    permalink = "/docs/architecture_guide/performance/",
    weight = "20",
)

diff --git a/g3doc/architecture_guide/Layers.png b/g3doc/architecture_guide/Layers.png
new file mode 100644
index 000000000..308c6c451
Binary files /dev/null and b/g3doc/architecture_guide/Layers.png differ
diff --git a/g3doc/architecture_guide/Layers.svg b/g3doc/architecture_guide/Layers.svg
new file mode 100644
index 000000000..0a366f841
--- /dev/null
+++ b/g3doc/architecture_guide/Layers.svg
@@ -0,0 +1 @@
(SVG content omitted)
diff --git a/g3doc/architecture_guide/Machine-Virtualization.png b/g3doc/architecture_guide/Machine-Virtualization.png
new file mode 100644
index 000000000..1ba2ed6b2
Binary files /dev/null and b/g3doc/architecture_guide/Machine-Virtualization.png differ
diff --git a/g3doc/architecture_guide/Machine-Virtualization.svg b/g3doc/architecture_guide/Machine-Virtualization.svg
new file mode 100644
index 000000000..5352da07b
--- /dev/null
+++ b/g3doc/architecture_guide/Machine-Virtualization.svg
@@ -0,0 +1 @@
(SVG content omitted)
diff --git a/g3doc/architecture_guide/README.md b/g3doc/architecture_guide/README.md
new file mode 100644
index 000000000..ce4c4ae69
--- /dev/null
+++ b/g3doc/architecture_guide/README.md
@@ -0,0 +1,80 @@
# Overview

gVisor provides a virtualized environment in order to sandbox untrusted containers. The system interfaces normally implemented by the host kernel are moved into a distinct, per-sandbox user space kernel in order to minimize the risk of an exploit. However, gVisor does not introduce large fixed overheads, and it retains a process-like model with respect to resource utilization.

## How is this different?

Two other approaches are commonly taken to provide stronger isolation than native containers.

**Machine-level virtualization**, such as [KVM][kvm] and [Xen][xen], exposes virtualized hardware to a guest kernel via a Virtual Machine Monitor (VMM). This virtualized hardware is generally enlightened (paravirtualized), and additional mechanisms can be used to improve the visibility between the guest and host (e.g. balloon drivers, paravirtualized spinlocks). Running containers in distinct virtual machines can provide great isolation, compatibility and performance (though nested virtualization may bring challenges in this area), but for containers it often requires additional proxies and agents, and may require a larger resource footprint and slower start-up times.

![Machine-level virtualization](Machine-Virtualization.png "Machine-level virtualization")

**Rule-based execution**, such as [seccomp][seccomp], [SELinux][selinux] and [AppArmor][apparmor], allows the specification of a fine-grained security policy for an application or container. These schemes typically rely on hooks implemented inside the host kernel to enforce the rules. If the surface can be made small enough (i.e. a sufficiently complete policy can be defined), then this is an excellent way to sandbox applications and maintain native performance. However, in practice it can be extremely difficult (if not impossible) to reliably define a policy for arbitrary, previously unknown applications, making this approach challenging to apply universally.

![Rule-based execution](Rule-Based-Execution.png "Rule-based execution")

Rule-based execution is often combined with additional layers for defense-in-depth.
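
To make rule-based execution concrete, here is a minimal sketch using seccomp's strict mode, the simplest policy Linux offers: after the call, only `read`, `write`, `_exit` and `sigreturn` are permitted, and any other system call terminates the process. This is purely illustrative; gVisor installs a far richer BPF filter, and a real Go program would need one too, since the Go runtime itself issues other system calls.

```go
package main

import (
	"golang.org/x/sys/unix"
)

func main() {
	// Required before installing a seccomp policy without CAP_SYS_ADMIN.
	if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
		panic(err)
	}
	// SECCOMP_MODE_STRICT: only read(2), write(2), _exit(2) and
	// sigreturn(2) remain callable; anything else raises SIGKILL.
	if err := unix.Prctl(unix.PR_SET_SECCOMP, unix.SECCOMP_MODE_STRICT, 0, 0, 0); err != nil {
		panic(err)
	}
	unix.Write(1, []byte("still alive\n"))       // write(2) is allowed...
	unix.Open("/etc/hostname", unix.O_RDONLY, 0) // ...but open(2) kills the process here.
}
```
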
**gVisor** provides a third isolation mechanism, distinct from those above.

gVisor intercepts application system calls and acts as the guest kernel, without the need for translation through virtualized hardware. gVisor may be thought of as either a merged guest kernel and VMM, or as seccomp on steroids. This architecture allows it to provide a flexible resource footprint (i.e. one based on threads and memory mappings, not fixed guest physical resources) while also lowering the fixed costs of virtualization. However, this comes at the price of reduced application compatibility and higher per-system-call overhead.

![gVisor](Layers.png "gVisor")

On top of this, gVisor employs rule-based execution to provide defense-in-depth (details below).

gVisor's approach is similar to [User Mode Linux (UML)][uml], although UML virtualizes hardware internally and thus provides a fixed resource footprint.

Each of the above approaches may excel in distinct scenarios. For example, machine-level virtualization will face challenges achieving high density, while gVisor may provide poor performance for system-call-heavy workloads.

### Why Go?

gVisor is written in [Go][golang] in order to avoid security pitfalls that can plague kernels. With Go, there are strong types, built-in bounds checks, no uninitialized variables, no use-after-free, no stack overflow, and a built-in race detector. (The use of Go has its challenges too, and isn't free.)
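
As a small illustration of what those language guarantees buy: an out-of-bounds access in Go is a deterministic runtime panic rather than silent memory corruption, turning a would-be memory-safety bug into a reportable crash. A toy example:

```go
package main

import "fmt"

func main() {
	buf := make([]byte, 8)
	i := 12 // imagine an index derived from untrusted input
	defer func() {
		// The bad access surfaces as a panic that can be caught and
		// reported, never as an out-of-bounds write.
		if r := recover(); r != nil {
			fmt.Println("caught:", r)
		}
	}()
	buf[i] = 0xff // bounds-checked: panics with "index out of range"
}
```
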
### What about Gofers?

[apparmor]: https://wiki.ubuntu.com/AppArmor
[golang]: https://golang.org
[kvm]: https://www.linux-kvm.org
[seccomp]: https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt
[selinux]: https://selinuxproject.org
[uml]: http://user-mode-linux.sourceforge.net/
[xen]: https://www.xenproject.org

diff --git a/g3doc/architecture_guide/Rule-Based-Execution.png b/g3doc/architecture_guide/Rule-Based-Execution.png
new file mode 100644
index 000000000..b42654a90
Binary files /dev/null and b/g3doc/architecture_guide/Rule-Based-Execution.png differ
diff --git a/g3doc/architecture_guide/Rule-Based-Execution.svg b/g3doc/architecture_guide/Rule-Based-Execution.svg
new file mode 100644
index 000000000..bd6717043
--- /dev/null
+++ b/g3doc/architecture_guide/Rule-Based-Execution.svg
@@ -0,0 +1 @@
(SVG content omitted)
diff --git a/g3doc/architecture_guide/Sentry-Gofer.png b/g3doc/architecture_guide/Sentry-Gofer.png
new file mode 100644
index 000000000..ca2c27ef7
Binary files /dev/null and b/g3doc/architecture_guide/Sentry-Gofer.png differ
diff --git a/g3doc/architecture_guide/Sentry-Gofer.svg b/g3doc/architecture_guide/Sentry-Gofer.svg
new file mode 100644
index 000000000..5c10750d2
--- /dev/null
+++ b/g3doc/architecture_guide/Sentry-Gofer.svg
@@ -0,0 +1 @@
(SVG content omitted)
diff --git a/g3doc/architecture_guide/performance.md b/g3doc/architecture_guide/performance.md
new file mode 100644
index 000000000..fd219be5e
--- /dev/null
+++ b/g3doc/architecture_guide/performance.md
@@ -0,0 +1,252 @@
# Performance Guide

gVisor is designed to provide a secure, virtualized environment while preserving key benefits of containerization, such as small fixed overheads and a dynamic resource footprint. For containerized infrastructure, this can provide a turn-key solution for sandboxing untrusted workloads: there are no changes to the fundamental resource model.

gVisor imposes runtime costs over native containers. These costs come in two forms: additional cycles and memory usage, which may manifest as increased latency, reduced throughput or density, or not at all. In general, these costs come from two different sources.

First, the existence of the [Sentry](../) means that additional memory will be required, and application system calls must traverse additional layers of software. The design emphasizes [security](../security/), and we therefore chose a language for the Sentry that provides benefits in this domain but may not yet offer the raw performance of other choices. Costs imposed by these design choices are **structural costs**.

Second, as gVisor is an independent implementation of the system call surface, many of the subsystems and specific calls are not as optimized as more mature implementations. A good example here is the network stack, which is continuing to evolve but does not support all the advanced recovery mechanisms offered by other stacks and is less CPU efficient. This is an **implementation cost** and is distinct from **structural costs**. Improvements here are ongoing and driven by the workloads that matter to gVisor users and contributors.

This page provides a guide for understanding baseline performance, and calls out distinct **structural costs** and **implementation costs**, highlighting where improvements are possible and not possible.

While we include a variety of workloads here, it's worth emphasizing that gVisor may not be an appropriate solution for every workload, for reasons other than performance. For example, a sandbox may provide minimal benefit for a trusted database, since _user data would already be inside the sandbox_ and there is no need for an attacker to break out in the first place.

## Methodology

All data below was generated using the [benchmark tools][benchmark-tools] repository, and the machines under test are uniform [Google Compute Engine][gce] Virtual Machines (VMs) with the following specifications:

    Machine type: n1-standard-4 (broadwell)
    Image: Debian GNU/Linux 9 (stretch) 4.19.0-0
    BootDisk: 2048GB SSD persistent disk

Throughout this document, `runsc` is used to indicate the runtime provided by gVisor. When relevant, we use the name `runsc-platform` to describe a specific [platform choice](../platforms/).

**Except where specified, all tests below are conducted with the `ptrace` platform. The `ptrace` platform works everywhere and does not require hardware virtualization or kernel modifications, but suffers from the highest structural costs by far. This platform is used to provide a clear understanding of the performance model, but in no way represents an ideal scenario. In the future, this guide will be extended to bare metal environments and include additional platforms.**

## Memory access

gVisor does not introduce any additional costs with respect to raw memory accesses. Page faults and other Operating System (OS) mechanisms are translated through the Sentry, but once mappings are installed and available to the application, there is no additional overhead.

{% include graph.html id="sysbench-memory" url="/performance/sysbench-memory.csv" title="perf.py sysbench.memory --runtime=runc --runtime=runsc" %}

The above figure demonstrates the memory transfer rate as measured by `sysbench`.

## Memory usage

The Sentry provides an additional layer of indirection, and it requires memory in order to store state associated with the application. This memory generally consists of a fixed component, plus an amount that varies with the usage of operating system resources (e.g. how many sockets or files are opened).

For many use cases, fixed memory overheads are a primary concern. This may be because sandboxed containers handle a low volume of requests, and it is therefore important to achieve high densities for efficiency.

{% include graph.html id="density" url="/performance/density.csv" title="perf.py density --runtime=runc --runtime=runsc" log="true" y_min="100000" %}

The above figure demonstrates these costs based on three sample applications. This test is the result of running many instances of a container (50, or 5 in the case of redis), calculating available memory on the host before and afterwards, and dividing the difference by the number of containers. This technique is used for measuring memory usage instead of the `usage_in_bytes` value of the container cgroup because we found that some container runtimes, other than `runc` and `runsc`, do not use an individual container cgroup.
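
A rough sketch of this measurement approach, assuming containers are started out-of-band between the two samples (the real harness lives in the benchmark tools repository; names here are illustrative):

```go
package main

import (
	"bufio"
	"fmt"
	"os"
	"strconv"
	"strings"
)

// memAvailableKB returns the MemAvailable field of /proc/meminfo in kB.
func memAvailableKB() (int64, error) {
	f, err := os.Open("/proc/meminfo")
	if err != nil {
		return 0, err
	}
	defer f.Close()
	s := bufio.NewScanner(f)
	for s.Scan() {
		fields := strings.Fields(s.Text()) // e.g. ["MemAvailable:", "1234567", "kB"]
		if len(fields) >= 2 && fields[0] == "MemAvailable:" {
			return strconv.ParseInt(fields[1], 10, 64)
		}
	}
	return 0, fmt.Errorf("MemAvailable not found")
}

func main() {
	const n = 50 // number of containers started between the two samples
	before, err := memAvailableKB()
	if err != nil {
		panic(err)
	}
	// ... start n containers here, e.g. with `docker run` ...
	after, err := memAvailableKB()
	if err != nil {
		panic(err)
	}
	fmt.Printf("approximate overhead per container: %d kB\n", (before-after)/n)
}
```
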
The first application is an instance of `sleep`: a trivial application that does nothing. The second application is a synthetic `node` application which imports a number of modules and listens for requests. The third application is a similar synthetic `ruby` application which does the same. Finally, we include an instance of `redis` storing approximately 1GB of data. In all cases, the sandbox itself is responsible for a small, mostly fixed amount of memory overhead.

## CPU performance

gVisor does not perform emulation or otherwise interfere with the raw execution of CPU instructions by the application. Therefore, there is no runtime cost imposed for CPU operations.

{% include graph.html id="sysbench-cpu" url="/performance/sysbench-cpu.csv" title="perf.py sysbench.cpu --runtime=runc --runtime=runsc" %}

The above figure demonstrates the `sysbench` measurement of CPU events per second. Events per second is based on a CPU-bound loop that calculates all prime numbers in a specified range. We note that `runsc` does not impose a performance penalty, as the code is executing natively in both cases.

This has important consequences for classes of workloads that are often CPU-bound, such as data processing or machine learning. In these cases, `runsc` will similarly impose minimal runtime overhead.

{% include graph.html id="tensorflow" url="/performance/tensorflow.csv" title="perf.py tensorflow --runtime=runc --runtime=runsc" %}

For example, the above figure shows a sample TensorFlow workload, the [convolutional neural network example][cnn]. The time indicated includes the full start-up and run time for the workload, which trains a model.

## System calls

Some **structural costs** of gVisor are heavily influenced by the [platform choice](../platforms/), which implements system call interception. Today, gVisor supports a variety of platforms. These platforms present distinct performance, compatibility and security trade-offs. For example, the KVM platform has low-overhead system call interception but runs poorly with nested virtualization.

{% include graph.html id="syscall" url="/performance/syscall.csv" title="perf.py syscall --runtime=runc --runtime=runsc-ptrace --runtime=runsc-kvm" y_min="100" log="true" %}

The above figure demonstrates the time required for a raw system call on various platforms. The test is implemented by a custom binary which performs a large number of system calls and calculates the average time required.
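
The sketch below shows the shape of such a microbenchmark (not the actual test binary): time a tight loop of `getpid(2)` calls, one of the cheapest ways to cross the system call boundary, and report the average.

```go
package main

import (
	"fmt"
	"time"

	"golang.org/x/sys/unix"
)

func main() {
	const iterations = 1000000
	start := time.Now()
	for i := 0; i < iterations; i++ {
		// getpid(2) does almost no work, so the loop measures the cost of
		// entering and leaving the kernel (or, under gVisor, the Sentry).
		unix.Getpid()
	}
	fmt.Printf("average: %v per system call\n", time.Since(start)/iterations)
}
```
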
This overhead will principally impact applications that are system call bound, which tend to be high-performance data stores and static network services. In general, the impact of system call interception will be lower the more work an application does.

{% include graph.html id="redis" url="/performance/redis.csv" title="perf.py redis --runtime=runc --runtime=runsc" %}

For example, `redis` is an application that performs relatively little work in userspace: in general it reads from a connected socket, reads or modifies some data, and writes a result back to the socket. The above figure shows the results of running a [comprehensive set of benchmarks][redis-benchmark]. We can see that small operations impose a large overhead, while larger operations, such as `LRANGE`, where more work is done in the application, have a smaller relative overhead.

Some of the costs above are **structural costs**, and `redis` is likely to remain a challenging performance scenario. However, optimizing the [platform](../platforms/) will also have a dramatic impact.

## Start-up time

For many use cases, the ability to spin up containers quickly and efficiently is important. A sandbox may be short-lived and perform minimal user work (e.g. a function invocation).

{% include graph.html id="startup" url="/performance/startup.csv" title="perf.py startup --runtime=runc --runtime=runsc" %}

The above figure indicates the total time required to start a container through [Docker][docker]. This benchmark uses three different applications. First, an Alpine Linux container that executes `true`. Second, a `node` application that loads a number of modules and binds an HTTP server. The time is measured by a successful request to the bound port. Finally, a `ruby` application that similarly loads a number of modules and binds an HTTP server.

> Note: most of the time overhead above is associated with Docker itself. This is evident with the empty `runc` benchmark. To avoid these costs with `runsc`, you may also consider using `runsc do` mode or invoking the [OCI runtime](../../user_guide/quick_start/oci/) directly.

## Network

Networking is mostly bound by **implementation costs**, and gVisor's network stack is improving quickly.

While typically not an important metric in practice for common sandbox use cases, `iperf` is nevertheless a common microbenchmark used to measure raw throughput.

{% include graph.html id="iperf" url="/performance/iperf.csv" title="perf.py iperf --runtime=runc --runtime=runsc" %}

The above figure shows the result of an `iperf` test between two instances. For the upload case, the specified runtime is used for the `iperf` client, and in the download case, the specified runtime is the server. A native runtime is always used for the other endpoint in the test.

{% include graph.html id="applications" metric="requests_per_second" url="/performance/applications.csv" title="perf.py http.(node|ruby) --connections=25 --runtime=runc --runtime=runsc" %}

The above figure shows the result of simple `node` and `ruby` web services that render a template upon receiving a request. Because these synthetic benchmarks do minimal work per request, much like the `redis` case, they suffer from high overheads. In practice, the more work an application does, the smaller the impact of **structural costs** becomes.
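
A Go analogue of such a synthetic service is just a few lines; because each request does almost no application work, per-request sandbox overheads dominate whatever is measured against it. This is a hypothetical stand-in, not the benchmark's actual `node`/`ruby` sources:

```go
package main

import (
	"fmt"
	"log"
	"net/http"
)

func main() {
	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		// Render a trivial "template": minimal userspace work per request.
		fmt.Fprintf(w, "<html><body>hello, %s</body></html>", r.URL.Path)
	})
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```
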
## File system

Some aspects of file system performance are also reflective of **implementation costs**, an area where gVisor's implementation is improving quickly.

In terms of raw disk I/O, gVisor does not introduce significant fundamental overhead. For general file operations, gVisor introduces a small fixed overhead for data that transitions across the sandbox boundary. This manifests as **structural costs** in some cases, since these operations must be routed through the [Gofer](../) as a result of our [security model](../security/), but in most cases are dominated by **implementation costs**, due to an internal [Virtual File System][vfs] (VFS) implementation that needs improvement.

{% include graph.html id="fio-bw" url="/performance/fio.csv" title="perf.py fio --engine=sync --runtime=runc --runtime=runsc" log="true" %}

The above figures demonstrate the results of `fio` for reads and writes to and from the disk. In this case, the disk quickly becomes the bottleneck and dominates other costs.

{% include graph.html id="fio-tmpfs-bw" url="/performance/fio-tmpfs.csv" title="perf.py fio --engine=sync --runtime=runc --tmpfs=True --runtime=runsc" log="true" %}

The above figure shows the raw I/O performance of using a `tmpfs` mount, which is sandbox-internal in the case of `runsc`. Generally these operations are similarly bound by the cost of copying data around in memory, and we don't see the cost of VFS operations.

{% include graph.html id="httpd100k" metric="transfer_rate" url="/performance/httpd100k.csv" title="perf.py http.httpd --connections=1 --connections=5 --connections=10 --connections=25 --runtime=runc --runtime=runsc" %}

The high costs of VFS operations can manifest in benchmarks that execute many such operations in the hot path for serving requests, for example. The above figure shows the result of using gVisor to serve small pieces of static content with predictably poor results. This workload represents `apache` serving a single file sized 100k from the container image to a client running [ApacheBench][ab] with varying levels of concurrency. The high overhead comes principally from the VFS implementation that needs improvement, with several internal serialization points (since all requests are reading the same file). Note that some of the network stack performance issues also impact this benchmark.

{% include graph.html id="ffmpeg" url="/performance/ffmpeg.csv" title="perf.py media.ffmpeg --runtime=runc --runtime=runsc" %}

For benchmarks that are bound by raw disk I/O and a mix of compute, file system operations are less of an issue. The above figure shows the total time required for an `ffmpeg` container to start, load and transcode a 27MB input video.
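
For intuition, the kind of raw sequential-write measurement that `fio` performs above can be sketched in a few lines of Go (illustrative only; `fio` controls queue depth, direct I/O, and much more):

```go
package main

import (
	"fmt"
	"os"
	"time"
)

func main() {
	const (
		blockSize = 1 << 20         // write in 1 MiB blocks
		total     = 256 * blockSize // 256 MiB overall
	)
	f, err := os.Create("/tmp/io-sketch.dat")
	if err != nil {
		panic(err)
	}
	defer os.Remove("/tmp/io-sketch.dat")
	defer f.Close()

	buf := make([]byte, blockSize)
	start := time.Now()
	for written := 0; written < total; written += blockSize {
		if _, err := f.Write(buf); err != nil {
			panic(err)
		}
	}
	// Flush to the device so the page cache doesn't flatter the number.
	if err := f.Sync(); err != nil {
		panic(err)
	}
	secs := time.Since(start).Seconds()
	fmt.Printf("%.1f MiB/s\n", float64(total)/(1<<20)/secs)
}
```
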
[ab]: https://en.wikipedia.org/wiki/ApacheBench
[benchmark-tools]: https://github.com/google/gvisor/tree/master/benchmarks
[gce]: https://cloud.google.com/compute/
[cnn]: https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/convolutional_network.py
[docker]: https://docker.io
[redis-benchmark]: https://redis.io/topics/benchmarks
[vfs]: https://en.wikipedia.org/wiki/Virtual_file_system

diff --git a/g3doc/architecture_guide/platforms.md b/g3doc/architecture_guide/platforms.md
new file mode 100644
index 000000000..1f79971d1
--- /dev/null
+++ b/g3doc/architecture_guide/platforms.md
@@ -0,0 +1,86 @@
# Platform Guide

A gVisor sandbox consists of multiple processes when running. These processes collectively comprise a shared environment in which one or more containers can be run.

Each sandbox has its own isolated instance of:

* The **Sentry**, a user-space kernel that runs the container and intercepts and responds to system calls made by the application.

Each container running in the sandbox has its own isolated instance of:

* A **Gofer** which provides file system access to the container.

![gVisor architecture diagram](Sentry-Gofer.png "gVisor architecture diagram")

## runsc

The entrypoint to running a sandboxed container is the `runsc` executable. `runsc` implements the [Open Container Initiative (OCI)][oci] runtime specification. This means that OCI-compatible _filesystem bundles_ can be run by `runsc`. Filesystem bundles consist of a `config.json` file containing container configuration, and a root filesystem for the container. Please see the [OCI runtime spec][runtime-spec] for more information on filesystem bundles. `runsc` implements multiple commands that perform various functions such as starting, stopping, listing, and querying the status of containers.

## Sentry

The Sentry is the largest component of gVisor. It can be thought of as a userspace OS kernel. The Sentry implements all the kernel functionality needed by the untrusted application. It implements all of the supported system calls, signal delivery, memory management and page faulting logic, the threading model, and more.

When the untrusted application makes a system call, the currently used platform redirects the call to the Sentry, which will do the necessary work to service it. It is important to note that the Sentry does not simply pass through system calls to the host kernel. As a userspace application, the Sentry will make some host system calls to support its operation, but it does not allow the application to directly control the system calls it makes.

The Sentry aims to present an equivalent environment to (upstream) Linux v4.4.

File system operations that extend beyond the sandbox (i.e. not internal `/proc` files, pipes, etc.) are sent to the Gofer, described below.

## Platforms

gVisor requires a platform to implement interception of syscalls, basic context switching, and memory mapping functionality.

### ptrace

The ptrace platform uses `PTRACE_SYSEMU` to execute user code without allowing it to execute host system calls. This platform can run anywhere that ptrace works (even VMs without nested virtualization).
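
The following toy tracer shows the underlying mechanism on linux/amd64, using `PTRACE_SYSCALL` to stop at each system call and read the registers. It only *observes* calls; the actual platform uses `PTRACE_SYSEMU`, which stops the tracee *instead of* executing the call so the Sentry can service it. A hypothetical sketch, not gVisor's implementation:

```go
package main

import (
	"fmt"
	"os/exec"
	"runtime"
	"syscall"
)

func main() {
	// ptrace requests must come from the tracing thread.
	runtime.LockOSThread()

	cmd := exec.Command("/bin/true")
	cmd.SysProcAttr = &syscall.SysProcAttr{Ptrace: true}
	if err := cmd.Start(); err != nil {
		panic(err)
	}
	pid := cmd.Process.Pid

	var ws syscall.WaitStatus
	if _, err := syscall.Wait4(pid, &ws, 0, nil); err != nil {
		panic(err) // child stops with SIGTRAP at its first instruction
	}

	var regs syscall.PtraceRegs
	for {
		// Resume until the next system call entry or exit.
		if err := syscall.PtraceSyscall(pid, 0); err != nil {
			panic(err)
		}
		if _, err := syscall.Wait4(pid, &ws, 0, nil); err != nil || ws.Exited() {
			break
		}
		if err := syscall.PtraceGetRegs(pid, &regs); err != nil {
			break
		}
		// Stops alternate between entry and exit; Orig_rax holds the
		// system call number (x86-64 specific).
		fmt.Printf("syscall %d\n", regs.Orig_rax)
	}
}
```
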
### KVM (experimental)

The KVM platform allows the Sentry to act as both guest OS and VMM, switching back and forth between the two worlds seamlessly. The KVM platform can run on bare-metal or in a VM with nested virtualization enabled. While there is no virtualized hardware layer -- the sandbox retains a process model -- gVisor leverages virtualization extensions available on modern processors in order to improve isolation and performance of address space switches.

## Gofer

The Gofer is a normal host Linux process. The Gofer is started with each sandbox and connected to the Sentry. The Sentry process is started in a restricted seccomp container without access to file system resources. The Gofer provides the Sentry access to file system resources via the 9P protocol and provides an additional level of isolation.

## Application

The application (aka the untrusted application) is a normal Linux binary provided to gVisor in an OCI runtime bundle. gVisor aims to provide an environment equivalent to Linux v4.4, so applications should be able to run unmodified. However, gVisor does not presently implement every system call, `/proc` file, or `/sys` file, so some incompatibilities may occur.

[oci]: https://www.opencontainers.org
[runtime-spec]: https://github.com/opencontainers/runtime-spec

diff --git a/g3doc/architecture_guide/resources.md b/g3doc/architecture_guide/resources.md
new file mode 100644
index 000000000..7e45b58a9
--- /dev/null
+++ b/g3doc/architecture_guide/resources.md
@@ -0,0 +1 @@
# Resource Model

diff --git a/g3doc/architecture_guide/security.md b/g3doc/architecture_guide/security.md
new file mode 100644
index 000000000..59003f0a8
--- /dev/null
+++ b/g3doc/architecture_guide/security.md
@@ -0,0 +1,251 @@
# Security Model

gVisor was created in order to provide additional defense against the exploitation of kernel bugs by untrusted userspace code. In order to understand how gVisor achieves this goal, it is first necessary to understand the basic threat model.

## Threats: The Anatomy of an Exploit

An exploit takes advantage of a software or hardware bug in order to escalate privileges, gain access to privileged data, or disrupt services. All of the possible interactions that a malicious application can have with the rest of the system (attack vectors) define the attack surface. We categorize these attack vectors into several common classes.

### System API

An operating system or hypervisor exposes an abstract System API in the form of system calls and traps. This API may be documented and stable, as with Linux, or it may be abstracted behind a library, as with Windows (e.g. win32.dll or ntdll.dll). The System API includes all standard interfaces that application code uses to interact with the system. This includes high-level abstractions that are derived from low-level system calls, such as system files, sockets and namespaces.

Although the System API is exposed to applications by design, bugs and race conditions within the kernel or hypervisor may occasionally be exploitable via the API. This is common in part because most kernels and hypervisors are written in [C][clang], which is well-suited to interfacing with hardware but often prone to security issues. In order to exploit these issues, a typical attack might involve some combination of the following:

1. Opening or creating some combination of files, sockets or other descriptors.
1. Passing crafted, malicious arguments, structures or packets.
1. Racing with multiple threads in order to hit specific code paths.

For example, for the [Dirty Cow][dirtycow] privilege escalation bug, an application would open a specific file in `/proc` or use a specific `ptrace` system call, and use multiple threads in order to trigger a race condition when touching a fresh page of memory. The attacker then gains control over a page of memory belonging to the system. With additional privileges or access to privileged data in the kernel, an attacker will often be able to employ additional techniques to gain full access to the rest of the system.

While bugs in the implementation of the System API are readily fixed, they are also the most common form of exploit. The exposure created by this class of exploit is what gVisor aims to minimize and control, as described in detail below.

### System ABI

Hardware and software exploits occasionally exist in execution paths that are not part of an intended System API.
In this case, exploits may be found as part of implicit actions the hardware or privileged system code takes in response to certain events, such as traps or interrupts. For example, the recent [POPSS][popss] flaw required only native code execution (no specific system call or file access). In that case, the Xen hypervisor was similarly vulnerable, highlighting that hypervisors are not immune to this vector.

### Side Channels

Hardware side channels may be exploitable by any code running on a system: native, sandboxed, or virtualized. However, many host-level mitigations against hardware side channels remain effective when using a sandbox. For example, kernels built with retpoline protect against some speculative execution attacks (Spectre) and frame poisoning may protect against L1 terminal fault (L1TF) attacks. Hypervisors may introduce additional complications in this regard, as there is no mitigation against an application in a normally functioning Virtual Machine (VM) exploiting the L1TF vulnerability for another VM on the sibling hyperthread.

### Other Vectors

The above categories in no way represent an exhaustive list of exploits, as we focus only on running untrusted code from within the operating system or hypervisor. We do not consider other ways that a more generic adversary may interact with a system, such as inserting a portable storage device with a malicious filesystem image, using a combination of crafted keyboard or touch inputs, or saturating a network device with ill-formed packets.

Furthermore, high-level systems may contain exploitable components. An attacker need not escalate privileges within a container if there’s an exploitable network-accessible service on the host or some other API path. *A sandbox is not a substitute for a secure architecture*.

## Goals: Limiting Exposure

gVisor’s primary design goal is to minimize the System API attack vector while still providing a process model. There are two primary security principles that inform this design. First, the application’s direct interactions with the host System API are intercepted by the Sentry, which implements the System API instead. Second, the System API accessible to the Sentry itself is minimized to a safer, restricted set. The first principle minimizes the possibility of direct exploitation of the host System API by applications, and the second principle minimizes indirect exploitability, which is the exploitation by an exploited or buggy Sentry (e.g. chaining an exploit).

The first principle is similar to the security basis for a Virtual Machine (VM). With a VM, an application’s interactions with the host are replaced by interactions with a guest operating system and a set of virtualized hardware devices. These hardware devices are then implemented via the host System API by a Virtual Machine Monitor (VMM). The Sentry similarly prevents direct interactions by providing its own implementation of the System API that the application must interact with. Applications are not able to directly craft specific arguments or flags for the host System API, or interact directly with host primitives.

For both the Sentry and a VMM, it’s worth noting that while direct interactions are not possible, indirect interactions are still possible.
For example, a read on a host-backed file in the Sentry may ultimately result in a host read system call (made by the Sentry, not by passing through arguments from the application), similar to how a read on a block device in a VM may result in the VMM issuing a corresponding host read system call from a backing file.

An important distinction from a VM is that the Sentry implements a System API based directly on host System API primitives instead of relying on virtualized hardware and a guest operating system. This selects a distinct set of trade-offs, largely in the performance, efficiency and compatibility domains. Since transitions in and out of the sandbox are relatively expensive, a guest operating system will typically take ownership of resources. For example, in the above case, the guest operating system may keep the block device data in a local page cache to avoid subsequent reads. This may lead to better performance but lower efficiency, since memory may be wasted or duplicated. The Sentry opts instead to defer to the host for many operations during runtime, for improved efficiency but lower performance in some use cases.

### What can a sandbox do?

An application in a gVisor sandbox is permitted to do most things a standard container can do: for example, applications can read and write files mapped within the container, make network connections, etc. As described above, gVisor's primary goal is to limit exposure to bugs and exploits while still allowing most applications to run. Even so, gVisor will limit some operations that might be permitted with a standard container. Even with appropriate capabilities, a user in a gVisor sandbox will only be able to manipulate virtualized system resources (e.g. the system time, kernel settings or filesystem attributes) and not underlying host system resources.

While the sandbox virtualizes many operations for the application, we limit the sandbox's own interactions with the host to the following high-level operations:

1. Communicate with a Gofer process via a connected socket. The sandbox may receive new file descriptors from the Gofer process, corresponding to opened files. These files can then be read from and written to by the sandbox. (See the sketch after this list.)
1. Make a minimal set of host system calls. The calls do not include the creation of new sockets (unless host networking mode is enabled) or opening files. The calls include duplication and closing of file descriptors, synchronization, timers and signal management.
1. Read and write packets to a virtual ethernet device. This is not required if host networking is enabled (or networking is disabled).
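
The first of these operations relies on standard Unix file descriptor passing (`SCM_RIGHTS`). The sketch below shows the receiving side in isolation; the socket path and message framing are hypothetical, and the real Sentry/Gofer channel speaks 9P rather than this toy protocol:

```go
package main

import (
	"fmt"
	"net"
	"os"

	"golang.org/x/sys/unix"
)

// recvFD receives a single open file descriptor over a connected Unix socket.
func recvFD(conn *net.UnixConn) (*os.File, error) {
	buf := make([]byte, 1)                 // 1 byte of in-band data
	oob := make([]byte, unix.CmsgSpace(4)) // room for one 32-bit fd
	_, oobn, _, _, err := conn.ReadMsgUnix(buf, oob)
	if err != nil {
		return nil, err
	}
	msgs, err := unix.ParseSocketControlMessage(oob[:oobn])
	if err != nil || len(msgs) == 0 {
		return nil, fmt.Errorf("no control message: %v", err)
	}
	fds, err := unix.ParseUnixRights(&msgs[0])
	if err != nil || len(fds) == 0 {
		return nil, fmt.Errorf("no file descriptor: %v", err)
	}
	return os.NewFile(uintptr(fds[0]), "received-fd"), nil
}

func main() {
	// Hypothetical socket path for the peer playing the Gofer role.
	conn, err := net.DialUnix("unix", nil, &net.UnixAddr{Name: "/tmp/gofer.sock", Net: "unix"})
	if err != nil {
		panic(err)
	}
	defer conn.Close()
	f, err := recvFD(conn)
	if err != nil {
		panic(err)
	}
	fmt.Println("received fd for:", f.Name())
}
```
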
### System ABI, Side Channels and Other Vectors

gVisor relies on the host operating system and the platform for defense against hardware-based attacks. Given the nature of these vulnerabilities, there is little defense that gVisor can provide (there’s no guarantee that additional hardware measures, such as virtualization, memory encryption, etc. would actually decrease the attack surface). Note that this is true even when using hardware virtualization for acceleration, as the host kernel or hypervisor is ultimately responsible for defending against attacks from within malicious guests.

gVisor similarly relies on the host resource mechanisms (cgroups) for defense against resource exhaustion and denial of service attacks. Network policy controls should be applied at the container level to ensure appropriate network policy enforcement. Note that the sandbox itself cannot alter or configure these mechanisms, and the sandbox should make it more difficult for an attacker to exploit or override these controls through other means.

## Principles: Defense-in-Depth

For gVisor development, there are several engineering principles that are employed in order to ensure that the system meets its design goals.

1. No system call is passed through directly to the host. Every supported call has an independent implementation in the Sentry that is unlikely to suffer from identical vulnerabilities that may appear in the host. This has the consequence that all kernel features used by applications require an implementation within the Sentry.
1. Only common, universal functionality is implemented. Some filesystems, network devices or modules may expose specialized functionality to user space applications via mechanisms such as extended attributes, raw sockets or ioctls. Since the Sentry is responsible for implementing the full system call surface, we do not implement or pass through these specialized APIs.
1. The host surface exposed to the Sentry is minimized. While the system call surface is not trivial, it is explicitly enumerated and controlled. The Sentry is not permitted to open new files, create new sockets or do many other interesting things on the host.

Additionally, we have practical restrictions that are imposed on the project to minimize the risk of Sentry exploitability. For example:

1. Unsafe code is carefully controlled. All unsafe code is isolated in files that end with "unsafe.go", in order to facilitate validation and auditing. No file without the unsafe suffix may import the unsafe package.
1. No CGo is allowed. The Sentry must be a pure Go binary.
1. External imports are not generally allowed within the core packages. Only limited external imports are used within the setup code. The code available inside the Sentry is carefully controlled, to ensure that the above rules are effective.

Finally, we recognize that security is a process, and that vigilance is critical. Beyond our security disclosure process, the Sentry is fuzzed continuously to identify potential bugs and races proactively, and production crashes are recorded and triaged to similarly identify material issues.

## FAQ

### Is this more or less secure than a Virtual Machine?

The security of a VM depends to a large extent on what is exposed from the host kernel and user space support code. For example, device emulation code in the host kernel (e.g. APIC) or optimizations (e.g. vhost) can be more complex than a simple system call, and exploits carry the same risks. Similarly, the user space support code is frequently unsandboxed, and exploits, while rare, may allow unfettered access to the system.

Some platforms leverage the same virtualization hardware as VMs in order to provide better system call interception performance. However, gVisor does not implement any device emulation, and instead opts to use a sandboxed host System API directly. Both approaches significantly reduce the original attack surface. Ultimately, since gVisor is capable of using the same hardware mechanism, one should not assume that the mere use of virtualization hardware makes a system more or less secure, just as it would be a mistake to make the claim that the use of a unibody alone makes a car safe.

### Does this stop hardware side channels?
In general, gVisor does not provide protection against hardware side channels, although it may make exploits that rely on direct access to the host System API more difficult to use. To minimize exposure, you should follow relevant guidance from vendors and keep your host kernel and firmware up-to-date.

### Is this just a ptrace sandbox?

No: the term “ptrace sandbox” generally refers to software that uses the Linux ptrace facility to inspect and authorize system calls made by applications, enforcing a specific policy. These commonly suffer from two issues. First, vulnerable system calls may be authorized by the sandbox, as the application still has direct access to some System API. Second, it’s impossible to avoid time-of-check/time-of-use race conditions without disabling multi-threading.

In gVisor, the platforms that use ptrace operate differently. The stubs that are traced are never allowed to continue execution into the host kernel and complete a call directly. Instead, all system calls are interpreted and handled by the Sentry itself, which reflects the resulting register state back into the tracee before continuing execution in user space. This is very similar to the mechanism used by User-Mode Linux (UML).

[dirtycow]: https://en.wikipedia.org/wiki/Dirty_COW
[clang]: https://en.wikipedia.org/wiki/C_(programming_language)
[popss]: https://nvd.nist.gov/vuln/detail/CVE-2018-8897