diff options
Diffstat (limited to 'g3doc/architecture_guide')
-rw-r--r-- | g3doc/architecture_guide/BUILD | 50 | ||||
-rw-r--r-- | g3doc/architecture_guide/performance.md | 277 | ||||
-rw-r--r-- | g3doc/architecture_guide/platforms.md | 61 | ||||
-rw-r--r-- | g3doc/architecture_guide/platforms.png | bin | 21384 -> 0 bytes | |||
-rw-r--r-- | g3doc/architecture_guide/platforms.svg | 334 | ||||
-rw-r--r-- | g3doc/architecture_guide/resources.md | 144 | ||||
-rw-r--r-- | g3doc/architecture_guide/resources.png | bin | 16621 -> 0 bytes | |||
-rw-r--r-- | g3doc/architecture_guide/resources.svg | 208 | ||||
-rw-r--r-- | g3doc/architecture_guide/security.md | 255 | ||||
-rw-r--r-- | g3doc/architecture_guide/security.png | bin | 16932 -> 0 bytes | |||
-rw-r--r-- | g3doc/architecture_guide/security.svg | 153 |
11 files changed, 0 insertions, 1482 deletions
diff --git a/g3doc/architecture_guide/BUILD b/g3doc/architecture_guide/BUILD deleted file mode 100644 index 404f627a4..000000000 --- a/g3doc/architecture_guide/BUILD +++ /dev/null @@ -1,50 +0,0 @@ -load("//website:defs.bzl", "doc") - -package( - default_visibility = ["//website:__pkg__"], - licenses = ["notice"], -) - -doc( - name = "platforms", - src = "platforms.md", - category = "Architecture Guide", - data = [ - "platforms.png", - "platforms.svg", - ], - permalink = "/docs/architecture_guide/platforms/", - weight = "40", -) - -doc( - name = "resources", - src = "resources.md", - category = "Architecture Guide", - data = [ - "resources.png", - "resources.svg", - ], - permalink = "/docs/architecture_guide/resources/", - weight = "30", -) - -doc( - name = "security", - src = "security.md", - category = "Architecture Guide", - data = [ - "security.png", - "security.svg", - ], - permalink = "/docs/architecture_guide/security/", - weight = "10", -) - -doc( - name = "performance", - src = "performance.md", - category = "Architecture Guide", - permalink = "/docs/architecture_guide/performance/", - weight = "20", -) diff --git a/g3doc/architecture_guide/performance.md b/g3doc/architecture_guide/performance.md deleted file mode 100644 index b89facfd3..000000000 --- a/g3doc/architecture_guide/performance.md +++ /dev/null @@ -1,277 +0,0 @@ -# Performance Guide - -[TOC] - -gVisor is designed to provide a secure, virtualized environment while preserving -key benefits of containerization, such as small fixed overheads and a dynamic -resource footprint. For containerized infrastructure, this can provide a -turn-key solution for sandboxing untrusted workloads: there are no changes to -the fundamental resource model. - -gVisor imposes runtime costs over native containers. These costs come in two -forms: additional cycles and memory usage, which may manifest as increased -latency, reduced throughput or density, or not at all. 
In general, these costs -come from two different sources. - -First, the existence of the [Sentry](../README.md#sentry) means that additional -memory will be required, and application system calls must traverse additional -layers of software. The design emphasizes -[security](/docs/architecture_guide/security/) and therefore we chose to use a -language for the Sentry that provides benefits in this domain but may not yet -offer the raw performance of other choices. Costs imposed by these design -choices are **structural costs**. - -Second, as gVisor is an independent implementation of the system call surface, -many of the subsystems or specific calls are not as optimized as more mature -implementations. A good example here is the network stack, which is continuing -to evolve but does not support all the advanced recovery mechanisms offered by -other stacks and is less CPU efficient. This is an **implementation cost** and -is distinct from **structural costs**. Improvements here are ongoing and driven -by the workloads that matter to gVisor users and contributors. - -This page provides a guide for understanding baseline performance, and calls out -distinct **structural costs** and **implementation costs**, highlighting where -improvements are possible and not possible. - -While we include a variety of workloads here, it's worth emphasizing that gVisor -may not be an appropriate solution for every workload, for reasons other than -performance. For example, a sandbox may provide minimal benefit for a trusted -database, since _user data would already be inside the sandbox_ and there is no -need for an attacker to break out in the first place. 
- -## Methodology - -All data below was generated using the [benchmark tools][benchmark-tools] -repository, and the machines under test are uniform [Google Compute Engine][gce] -Virtual Machines (VMs) with the following specifications: - - Machine type: n1-standard-4 (broadwell) - Image: Debian GNU/Linux 9 (stretch) 4.19.0-0 - BootDisk: 2048GB SSD persistent disk - -Throughout this document, `runsc` is used to indicate the runtime provided by -gVisor. When relevant, we use the name `runsc-platform` to describe a specific -[platform choice](/docs/architecture_guide/platforms/). - -**Except where specified, all tests below are conducted with the `ptrace` -platform. The `ptrace` platform works everywhere and does not require hardware -virtualization or kernel modifications but suffers from the highest structural -costs by far. This platform is used to provide a clear understanding of the -performance model, but in no way represents an ideal scenario. In the future, -this guide will be extended to bare metal environments and include additional -platforms.** - -## Memory access - -gVisor does not introduce any additional costs with respect to raw memory -accesses. Page faults and other Operating System (OS) mechanisms are translated -through the Sentry, but once mappings are installed and available to the -application, there is no additional overhead. - -{% include graph.html id="sysbench-memory" -url="/performance/sysbench-memory.csv" title="perf.py sysbench.memory ---runtime=runc --runtime=runsc" %} - -The above figure demonstrates the memory transfer rate as measured by -`sysbench`. - -## Memory usage - -The Sentry provides an additional layer of indirection, and it requires memory -in order to store state associated with the application. This memory generally -consists of a fixed component, plus an amount that varies with the usage of -operating system resources (e.g. how many sockets or files are opened). 
- -For many use cases, fixed memory overheads are a primary concern. This may be -because sandboxed containers handle a low volume of requests, and it is -therefore important to achieve high densities for efficiency. - -{% include graph.html id="density" url="/performance/density.csv" title="perf.py -density --runtime=runc --runtime=runsc" log="true" y_min="100000" %} - -The above figure demonstrates these costs based on three sample applications. -This test is the result of running many instances of a container (50, or 5 in -the case of redis) and calculating available memory on the host before and -afterwards, and dividing the difference by the number of containers. This -technique is used for measuring memory usage over the `usage_in_bytes` value of -the container cgroup because we found that some container runtimes, other than -`runc` and `runsc`, do not use an individual container cgroup. - -The first application is an instance of `sleep`: a trivial application that does -nothing. The second application is a synthetic `node` application which imports -a number of modules and listens for requests. The third application is a similar -synthetic `ruby` application which does the same. Finally, we include an -instance of `redis` storing approximately 1GB of data. In all cases, the sandbox -itself is responsible for a small, mostly fixed amount of memory overhead. - -## CPU performance - -gVisor does not perform emulation or otherwise interfere with the raw execution -of CPU instructions by the application. Therefore, there is no runtime cost -imposed for CPU operations. - -{% include graph.html id="sysbench-cpu" url="/performance/sysbench-cpu.csv" -title="perf.py sysbench.cpu --runtime=runc --runtime=runsc" %} - -The above figure demonstrates the `sysbench` measurement of CPU events per -second. Events per second is based on a CPU-bound loop that calculates all prime -numbers in a specified range. 
We note that `runsc` does not impose a performance -penalty, as the code is executing natively in both cases. - -This has important consequences for classes of workloads that are often -CPU-bound, such as data processing or machine learning. In these cases, `runsc` -will similarly impose minimal runtime overhead. - -{% include graph.html id="tensorflow" url="/performance/tensorflow.csv" -title="perf.py tensorflow --runtime=runc --runtime=runsc" %} - -For example, the above figure shows a sample TensorFlow workload, the -[convolutional neural network example][cnn]. The time indicated includes the -full start-up and run time for the workload, which trains a model. - -## System calls - -Some **structural costs** of gVisor are heavily influenced by the -[platform choice](/docs/architecture_guide/platforms/), which implements system -call interception. Today, gVisor supports a variety of platforms. These -platforms present distinct performance, compatibility and security trade-offs. -For example, the KVM platform has low overhead system call interception but runs -poorly with nested virtualization. - -{% include graph.html id="syscall" url="/performance/syscall.csv" title="perf.py -syscall --runtime=runc --runtime=runsc-ptrace --runtime=runsc-kvm" y_min="100" -log="true" %} - -The above figure demonstrates the time required for a raw system call on various -platforms. The test is implemented by a custom binary which performs a large -number of system calls and calculates the average time required. - -This cost will principally impact applications that are system call bound, which -tend to be high-performance data stores and static network services. In general, -the impact of system call interception will be lower the more work an -application does. 
- -{% include graph.html id="redis" url="/performance/redis.csv" title="perf.py -redis --runtime=runc --runtime=runsc" %} - -For example, `redis` is an application that performs relatively little work in -userspace: in general it reads from a connected socket, reads or modifies some -data, and writes a result back to the socket. The above figure shows the results -of running a [comprehensive set of benchmarks][redis-benchmark]. We can see that -small operations impose a large overhead, while larger operations, such as -`LRANGE`, where more work is done in the application, have a smaller relative -overhead. - -Some of these costs above are **structural costs**, and `redis` is likely to -remain a challenging performance scenario. However, optimizing the -[platform](/docs/architecture_guide/platforms/) will also have a dramatic -impact. - -## Start-up time - -For many use cases, the ability to spin-up containers quickly and efficiently is -important. A sandbox may be short-lived and perform minimal user work (e.g. a -function invocation). - -{% include graph.html id="startup" url="/performance/startup.csv" title="perf.py -startup --runtime=runc --runtime=runsc" %} - -The above figure indicates the total time required to start a container through -[Docker][docker]. This benchmark uses three different applications. First, an -alpine Linux-container that executes `true`. Second, a `node` application that -loads a number of modules and binds an HTTP server. The time is measured by a -successful request to the bound port. Finally, a `ruby` application that -similarly loads a number of modules and binds an HTTP server. - -> Note: most of the time overhead above is associated with Docker itself. This is -> evident with the empty `runc` benchmark. To avoid these costs with `runsc`, -> you may also consider using `runsc do` mode or invoking the -> [OCI runtime](../user_guide/quick_start/oci.md) directly. 
- -## Network - -Networking is mostly bound by **implementation costs**, and gVisor's network -stack is improving quickly. - -While typically not an important metric in practice for common sandbox use -cases, nevertheless `iperf` is a common microbenchmark used to measure raw -throughput. - -{% include graph.html id="iperf" url="/performance/iperf.csv" title="perf.py -iperf --runtime=runc --runtime=runsc" %} - -The above figure shows the result of an `iperf` test between two instances. For -the upload case, the specified runtime is used for the `iperf` client, and in -the download case, the specified runtime is the server. A native runtime is -always used for the other endpoint in the test. - -{% include graph.html id="applications" metric="requests_per_second" -url="/performance/applications.csv" title="perf.py http.(node|ruby) ---connections=25 --runtime=runc --runtime=runsc" %} - -The above figure shows the result of simple `node` and `ruby` web services that -render a template upon receiving a request. Because these synthetic benchmarks -do minimal work per request, much like the `redis` case, they suffer from high -overheads. In practice, the more work an application does the smaller the impact -of **structural costs** becomes. - -## File system - -Some aspects of file system performance are also reflective of **implementation -costs**, and an area where gVisor's implementation is improving quickly. - -In terms of raw disk I/O, gVisor does not introduce significant fundamental -overhead. For general file operations, gVisor introduces a small fixed overhead -for data that transitions across the sandbox boundary. 
This manifests as -**structural costs** in some cases, since these operations must be routed -through the [Gofer](../README.md#gofer) as a result of our -[Security Model](/docs/architecture_guide/security/), but in most cases are -dominated by **implementation costs**, due to an internal -[Virtual File System][vfs] (VFS) implementation that needs improvement. - -{% include graph.html id="fio-bw" url="/performance/fio.csv" title="perf.py fio ---engine=sync --runtime=runc --runtime=runsc" log="true" %} - -The above figures demonstrate the results of `fio` for reads and writes to and -from the disk. In this case, the disk quickly becomes the bottleneck and -dominates other costs. - -{% include graph.html id="fio-tmpfs-bw" url="/performance/fio-tmpfs.csv" -title="perf.py fio --engine=sync --runtime=runc --tmpfs=True --runtime=runsc" -log="true" %} - -The above figure shows the raw I/O performance of using a `tmpfs` mount which is -sandbox-internal in the case of `runsc`. Generally these operations are -similarly bound to the cost of copying around data in-memory, and we don't see -the cost of VFS operations. - -{% include graph.html id="httpd100k" metric="transfer_rate" -url="/performance/httpd100k.csv" title="perf.py http.httpd --connections=1 ---connections=5 --connections=10 --connections=25 --runtime=runc ---runtime=runsc" %} - -The high costs of VFS operations can manifest in benchmarks that execute many -such operations in the hot path for serving requests, for example. The above -figure shows the result of using gVisor to serve small pieces of static content -with predictably poor results. This workload represents `apache` serving a -single file sized 100k from the container image to a client running -[ApacheBench][ab] with varying levels of concurrency. The high overhead comes -principally from the VFS implementation that needs improvement, with several -internal serialization points (since all requests are reading the same file). 
-Note that some of the network stack performance issues also impact this -benchmark. - -{% include graph.html id="ffmpeg" url="/performance/ffmpeg.csv" title="perf.py -media.ffmpeg --runtime=runc --runtime=runsc" %} - -For benchmarks that are bound by raw disk I/O and a mix of compute, file system -operations are less of an issue. The above figure shows the total time required -for an `ffmpeg` container to start, load and transcode a 27MB input video. - -[ab]: https://en.wikipedia.org/wiki/ApacheBench -[benchmark-tools]: https://github.com/google/gvisor/tree/master/test/benchmarks -[gce]: https://cloud.google.com/compute/ -[cnn]: https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/convolutional_network.py -[docker]: https://docker.io -[redis-benchmark]: https://redis.io/topics/benchmarks -[vfs]: https://en.wikipedia.org/wiki/Virtual_file_system diff --git a/g3doc/architecture_guide/platforms.md b/g3doc/architecture_guide/platforms.md deleted file mode 100644 index d112c9a28..000000000 --- a/g3doc/architecture_guide/platforms.md +++ /dev/null @@ -1,61 +0,0 @@ -# Platform Guide - -[TOC] - -gVisor requires a platform to implement interception of syscalls, basic context -switching, and memory mapping functionality. Internally, gVisor uses an -abstraction sensibly called [Platform][platform]. A simplified version of this -interface looks like: - -```golang -type Platform interface { - NewAddressSpace() (AddressSpace, error) - NewContext() Context -} - -type Context interface { - Switch(as AddressSpace, ac arch.Context) (..., error) -} - -type AddressSpace interface { - MapFile(addr usermem.Addr, f File, fr FileRange, at usermem.AccessType, ...) error - Unmap(addr usermem.Addr, length uint64) -} -``` - -There are a number of different ways to implement this interface that come with -various trade-offs, generally around performance and hardware requirements. 
- -## Implementations - -The choice of platform depends on the context in which `runsc` is executing. In -general, virtualized platforms may be limited to platforms that do not require -hardware virtualization support (since the hardware is already in use): - -![Platforms](platforms.png "Platform examples.") - -### ptrace - -The ptrace platform uses [PTRACE_SYSEMU][ptrace] to execute user code without -allowing it to execute host system calls. This platform can run anywhere that -`ptrace` works (even VMs without nested virtualization), which is ubiquitous. - -Unfortunately, the ptrace platform has high context switch overhead, so system -call-heavy applications may pay a [performance penalty](./performance.md). - -### KVM - -The KVM platform uses the kernel's [KVM][kvm] functionality to allow the Sentry -to act as both guest OS and VMM. The KVM platform can run on bare-metal or in a -VM with nested virtualization enabled. While there is no virtualized hardware -layer -- the sandbox retains a process model -- gVisor leverages virtualization -extensions available on modern processors in order to improve isolation and -performance of address space switches. - -## Changing Platforms - -See [Changing Platforms](../user_guide/platforms.md). 
- -[kvm]: https://www.kernel.org/doc/Documentation/virtual/kvm/api.txt -[platform]: https://cs.opensource.google/gvisor/gvisor/+/release-20190304.1:pkg/sentry/platform/platform.go;l=33 -[ptrace]: http://man7.org/linux/man-pages/man2/ptrace.2.html diff --git a/g3doc/architecture_guide/platforms.png b/g3doc/architecture_guide/platforms.png Binary files differdeleted file mode 100644 index 005d56feb..000000000 --- a/g3doc/architecture_guide/platforms.png +++ /dev/null diff --git a/g3doc/architecture_guide/platforms.svg b/g3doc/architecture_guide/platforms.svg deleted file mode 100644 index b0bac9ba7..000000000 --- a/g3doc/architecture_guide/platforms.svg +++ /dev/null @@ -1,334 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" standalone="no"?> -<!-- Created with Inkscape (http://www.inkscape.org/) --> - -<svg - xmlns:dc="http://purl.org/dc/elements/1.1/" - xmlns:cc="http://creativecommons.org/ns#" - xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" - xmlns:svg="http://www.w3.org/2000/svg" - xmlns="http://www.w3.org/2000/svg" - xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" - xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" - width="142.67763mm" - height="67.063133mm" - viewBox="0 0 142.67763 67.063134" - version="1.1" - id="svg8" - inkscape:export-filename="/home/ascannell/resources.png" - inkscape:export-xdpi="53.50127" - inkscape:export-ydpi="53.50127" - inkscape:version="0.92.4 (5da689c313, 2019-01-14)" - sodipodi:docname="platforms.svg"> - <defs - id="defs2" /> - <sodipodi:namedview - id="base" - pagecolor="#ffffff" - bordercolor="#666666" - borderopacity="1.0" - inkscape:pageopacity="0.0" - inkscape:pageshadow="2" - inkscape:zoom="0.98994949" - inkscape:cx="86.443612" - inkscape:cy="102.88104" - inkscape:document-units="mm" - inkscape:current-layer="layer1" - showgrid="false" - fit-margin-top="0" - fit-margin-left="0" - fit-margin-right="0" - fit-margin-bottom="0" - inkscape:window-width="1920" - inkscape:window-height="1005" - 
inkscape:window-x="0" - inkscape:window-y="0" - inkscape:window-maximized="1" /> - <metadata - id="metadata5"> - <rdf:RDF> - <cc:Work - rdf:about=""> - <dc:format>image/svg+xml</dc:format> - <dc:type - rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> - <dc:title></dc:title> - </cc:Work> - </rdf:RDF> - </metadata> - <g - inkscape:label="Layer 1" - inkscape:groupmode="layer" - id="layer1" - transform="translate(-36.081387,-98.953278)"> - <rect - id="rect10" - width="33.408691" - height="33.408691" - x="36.081387" - y="120.06757" - style="fill:#44aa00;stroke-width:0.26458332" /> - <rect - style="fill:#b3b3b3;stroke-width:0.23881446" - id="rect16" - width="142.45465" - height="10.423517" - x="36.08139" - y="155.5929" /> - <rect - id="rect10-7" - width="30.52453" - height="18.976137" - x="37.416695" - y="121.65508" - style="fill:#ff8080;stroke-width:0.19060372" /> - <text - xml:space="preserve" - style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314" - x="41.03727" - y="148.58765" - id="text65"><tspan - sodipodi:role="line" - id="tspan63" - x="41.03727" - y="148.58765" - style="stroke-width:0.08507314">gVisor</tspan></text> - <text - xml:space="preserve" - style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847" - x="45.473087" - y="132.50232" - id="text123"><tspan - sodipodi:role="line" - id="tspan121" - x="45.473087" - y="132.50232" - style="stroke-width:0.08327847">workload</tspan></text> - <text - xml:space="preserve" - style="font-style:normal;font-weight:normal;font-size:6.43922186px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.16098055" - x="97.768547" - y="163.15665" - 
id="text163"><tspan - sodipodi:role="line" - id="tspan161" - x="97.768547" - y="163.15665" - style="stroke-width:0.16098055">host</tspan></text> - <rect - style="fill:#e9afdd;stroke-width:0.39185274" - id="rect16-7" - width="72.9646" - height="54.79026" - x="105.79441" - y="98.953278" /> - <rect - id="rect10-5" - width="33.408691" - height="33.408691" - x="108.24348" - y="100.53072" - style="fill:#44aa00;stroke-width:0.26458332" /> - <rect - id="rect10-7-6" - width="30.52453" - height="20.045216" - x="109.57877" - y="102.11823" - style="fill:#ff8080;stroke-width:0.19589928" /> - <text - xml:space="preserve" - style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314" - x="112.86765" - y="129.01863" - id="text65-2"><tspan - sodipodi:role="line" - id="tspan63-9" - x="112.86765" - y="129.01863" - style="stroke-width:0.08507314">gVisor</tspan></text> - <text - xml:space="preserve" - style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847" - x="117.63519" - y="114.02371" - id="text123-1"><tspan - sodipodi:role="line" - id="tspan121-2" - x="117.63519" - y="114.02371" - style="stroke-width:0.08327847">workload</tspan></text> - <rect - id="rect10-7-7" - width="11.815663" - height="8.0126781" - x="54.538059" - y="143.27702" - style="fill:#aaccff;stroke-width:0.07705856" /> - <text - xml:space="preserve" - style="font-style:normal;font-weight:normal;font-size:4.35074377px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.10876859" - x="55.931114" - y="148.90578" - id="text144"><tspan - sodipodi:role="line" - id="tspan142" - x="55.931114" - y="148.90578" - 
style="stroke-width:0.10876859">KVM</tspan></text> - <rect - id="rect10-6" - width="33.408691" - height="33.408691" - x="71.044685" - y="119.73112" - style="fill:#44aa00;stroke-width:0.26458332" /> - <rect - id="rect10-7-0" - width="30.52453" - height="18.976137" - x="72.37999" - y="121.31865" - style="fill:#ff8080;stroke-width:0.19060372" /> - <text - xml:space="preserve" - style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314" - x="76.000565" - y="148.25128" - id="text65-6"><tspan - sodipodi:role="line" - id="tspan63-2" - x="76.000565" - y="148.25128" - style="stroke-width:0.08507314">gVisor</tspan></text> - <text - xml:space="preserve" - style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847" - x="80.436386" - y="132.16595" - id="text123-6"><tspan - sodipodi:role="line" - id="tspan121-1" - x="80.436386" - y="132.16595" - style="stroke-width:0.08327847">workload</tspan></text> - <rect - id="rect10-7-7-8" - width="11.815664" - height="8.0126781" - x="89.501358" - y="142.94067" - style="fill:#ffeeaa;stroke-width:0.07705856" /> - <text - xml:space="preserve" - style="font-style:normal;font-weight:normal;font-size:3.39456654px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08486416" - x="89.92292" - y="147.89806" - id="text144-7"><tspan - sodipodi:role="line" - id="tspan142-9" - x="89.92292" - y="147.89806" - style="stroke-width:0.08486416">ptrace</tspan></text> - <rect - id="rect10-7-7-8-3" - width="11.815665" - height="8.0126781" - x="127.08897" - y="123.97878" - style="fill:#ffeeaa;stroke-width:0.07705856" /> - <text - xml:space="preserve" - 
style="font-style:normal;font-weight:normal;font-size:3.39456654px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08486416" - x="127.51052" - y="128.9362" - id="text144-7-7"><tspan - sodipodi:role="line" - id="tspan142-9-5" - x="127.51052" - y="128.9362" - style="stroke-width:0.08486416">ptrace</tspan></text> - <text - xml:space="preserve" - style="font-style:normal;font-weight:normal;font-size:5.45061255px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.13626531" - x="138.49318" - y="152.11841" - id="text229"><tspan - sodipodi:role="line" - id="tspan227" - x="138.49318" - y="152.11841" - style="stroke-width:0.13626531">VM</tspan></text> - <rect - style="fill:#b3b3b3;stroke-width:0.16518368" - id="rect16-9" - width="68.15374" - height="10.423517" - x="108.24348" - y="134.99774" /> - <text - xml:space="preserve" - style="font-style:normal;font-weight:normal;font-size:6.17854786px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.15446369" - x="132.91473" - y="142.07658" - id="text248"><tspan - sodipodi:role="line" - id="tspan246" - x="132.91473" - y="142.07658" - style="stroke-width:0.15446369">guest</tspan></text> - <rect - id="rect10-5-2" - width="33.408691" - height="33.408691" - x="143.32402" - y="100.35877" - style="fill:#44aa00;stroke-width:0.26458332" /> - <rect - id="rect10-7-6-2" - width="30.52453" - height="20.045216" - x="144.65933" - y="101.94627" - style="fill:#ff8080;stroke-width:0.19589929" /> - <text - xml:space="preserve" - style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314" - x="147.94815" - y="128.84665" - id="text65-2-8"><tspan - sodipodi:role="line" 
- id="tspan63-9-9" - x="147.94815" - y="128.84665" - style="stroke-width:0.08507314">gVisor</tspan></text> - <text - xml:space="preserve" - style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847" - x="152.71565" - y="113.85176" - id="text123-1-7"><tspan - sodipodi:role="line" - id="tspan121-2-3" - x="152.71565" - y="113.85176" - style="stroke-width:0.08327847">workload</tspan></text> - <rect - id="rect10-7-7-8-3-6" - width="11.815666" - height="8.0126781" - x="162.16933" - y="123.80682" - style="fill:#ffeeaa;stroke-width:0.07705856" /> - <text - xml:space="preserve" - style="font-style:normal;font-weight:normal;font-size:3.39456654px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08486416" - x="162.59088" - y="128.76421" - id="text144-7-7-1"><tspan - sodipodi:role="line" - id="tspan142-9-5-2" - x="162.59088" - y="128.76421" - style="stroke-width:0.08486416">ptrace</tspan></text> - </g> -</svg> diff --git a/g3doc/architecture_guide/resources.md b/g3doc/architecture_guide/resources.md deleted file mode 100644 index fc997d40c..000000000 --- a/g3doc/architecture_guide/resources.md +++ /dev/null @@ -1,144 +0,0 @@ -# Resource Model - -[TOC] - -The resource model for gVisor does not assume a fixed number of threads of -execution (i.e. vCPUs) or amount of physical memory. Where possible, decisions -about underlying physical resources are delegated to the host system, where -optimizations can be made with global information. This delegation allows the -sandbox to be highly dynamic in terms of resource usage: spanning a large number -of cores and large amount of memory when busy, and yielding those resources back -to the host when not. 
- -In other words, the shape of the sandbox should closely track the shape of the -sandboxed process: - -![Resource model](resources.png "Workloads of different shapes.") - -## Processes - -Much like a Virtual Machine (VM), a gVisor sandbox appears as an opaque process -on the system. Processes within the sandbox do not manifest as processes on the -host system, and process-level interactions within the sandbox require entering -the sandbox (e.g. via a [Docker exec][exec]). - -## Networking - -The sandbox attaches a network endpoint to the system, but runs its own network -stack. All network resources, other than packets in flight on the host, exist -only inside the sandbox, bound by relevant resource limits. - -You can interact with network endpoints exposed by the sandbox, just as you -would any other container, but network introspection similarly requires entering -the sandbox. - -## Files - -Files in the sandbox may be backed by different implementations. For host-native -files (where a file descriptor is available), the Gofer may return a file -descriptor to the Sentry via [SCM_RIGHTS][scmrights][^1]. - -These files may be read from and written to through standard system calls, and -also mapped into the associated application's address space. This allows the -same host memory to be shared across multiple sandboxes, although this mechanism -does not preclude the use of side-channels (see [Security Model](./security.md)). - -Note that some file systems exist only within the context of the sandbox. For -example, in many cases a `tmpfs` mount will be available at `/tmp` or -`/dev/shm`, which allocates memory directly from the sandbox memory file (see -below). Ultimately, these will be accounted against relevant limits in a similar -way as the host native case. - -## Threads - -The Sentry models individual task threads with [goroutines][goroutine]. 
As a -result, each task thread is a lightweight [green thread][greenthread], and may -not correspond to an underlying host thread. - -However, application execution is modelled as a blocking system call with the -Sentry. This means that additional host threads may be created, *depending on -the number of active application threads*. In practice, a busy application will -converge on the number of active threads, and the host will be able to make -scheduling decisions about all application threads. - -## Time - -Time in the sandbox is provided by the Sentry, through its own [vDSO][vdso] and -time-keeping implementation. This is distinct from the host time, and no state -is shared with the host, although the time will be initialized with the host -clock. - -The Sentry runs timers to note the passage of time, much like a kernel running -on hardware (though the timers are software timers, in this case). These timers -provide updates to the vDSO, the time returned through system calls, and the -time recorded for usage or limit tracking (e.g. [RLIMIT_CPU][rlimit]). - -When all application threads are idle, the Sentry disables timers until an event -occurs that wakes either the Sentry or an application thread, similar to a -[tickless kernel][tickless]. This allows the Sentry to achieve near zero CPU -usage for idle applications. - -## Memory - -The Sentry implements its own memory management, including demand-paging and a -Sentry internal page cache for files that cannot be used natively. A single -[memfd][memfd] backs all application memory. - -### Address spaces - -The creation of address spaces is platform-specific. For some platforms, -additional "stub" processes may be created on the host in order to support -additional address spaces. These stubs are subject to various limits applied at -the sandbox level (e.g. PID limits). - -### Physical memory - -The host is able to manage physical memory using regular means (e.g. 
tracking -working sets, reclaiming and swapping under pressure). The Sentry lazily -populates host mappings for applications, and allows the host to demand-page -those regions, which is critical for the functioning of those mechanisms. - -In order to avoid excessive overhead, the Sentry does not demand-page individual -pages. Instead, it selects appropriate regions based on heuristics. There is a -trade-off here: the Sentry is unable to trivially determine which pages are -active and which are not. Even if pages were individually faulted, the host may -select pages to be reclaimed or swapped without the Sentry's knowledge. - -Therefore, memory usage statistics within the sandbox (e.g. via `proc`) are -approximations. The Sentry maintains an internal breakdown of memory usage, and -can collect accurate information but only through a relatively expensive API -call. In any case, it would likely be considered unwise to share precise -information about how the host is managing memory with the sandbox. - -Finally, when an application marks a region of memory as no longer needed, for -example via a call to [madvise][madvise], the Sentry *releases this memory back -to the host*. There can be performance penalties for this, since it may be -cheaper in many cases to retain the memory and use it to satisfy some other -request. However, releasing it immediately to the host allows the host to more -effectively multiplex resources and apply an efficient global policy. - -## Limits - -All Sentry threads and Sentry memory are subject to a container cgroup. However, -application usage will not appear as anonymous memory usage, and will instead be -accounted to the `memfd`. All anonymous memory will correspond to Sentry usage, -and host memory charged to the container will work as standard. - -The cgroups can be monitored for standard signals: pressure indicators, -threshold notifiers, etc. and can also be adjusted dynamically. 
Note that the -Sentry itself may listen for pressure signals in its containing cgroup, in order -to purge internal caches. - -[goroutine]: https://tour.golang.org/concurrency/1 -[greenthread]: https://en.wikipedia.org/wiki/Green_threads -[scheduler]: https://morsmachine.dk/go-scheduler -[vdso]: https://en.wikipedia.org/wiki/VDSO -[rlimit]: http://man7.org/linux/man-pages/man2/getrlimit.2.html -[tickless]: https://en.wikipedia.org/wiki/Tickless_kernel -[memfd]: http://man7.org/linux/man-pages/man2/memfd_create.2.html -[scmrights]: http://man7.org/linux/man-pages/man7/unix.7.html -[madvise]: http://man7.org/linux/man-pages/man2/madvise.2.html -[exec]: https://docs.docker.com/engine/reference/commandline/exec/ -[^1]: Unless host networking is enabled, the Sentry is not able to create or - open host file descriptors itself; it can only receive them in this way - from the Gofer. diff --git a/g3doc/architecture_guide/resources.png b/g3doc/architecture_guide/resources.png Binary files differdeleted file mode 100644 index f715008ec..000000000 --- a/g3doc/architecture_guide/resources.png +++ /dev/null diff --git a/g3doc/architecture_guide/resources.svg b/g3doc/architecture_guide/resources.svg deleted file mode 100644 index fd7805d90..000000000 --- a/g3doc/architecture_guide/resources.svg +++ /dev/null @@ -1,208 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" standalone="no"?> -<!-- Created with Inkscape (http://www.inkscape.org/) --> - -<svg - xmlns:dc="http://purl.org/dc/elements/1.1/" - xmlns:cc="http://creativecommons.org/ns#" - xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" - xmlns:svg="http://www.w3.org/2000/svg" - xmlns="http://www.w3.org/2000/svg" - xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" - xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" - width="108.24417mm" - height="47.513165mm" - viewBox="0 0 108.24417 47.513165" - version="1.1" - id="svg8" - inkscape:export-filename="/home/ascannell/resources.png" - 
inkscape:export-xdpi="53.50127" - inkscape:export-ydpi="53.50127" - inkscape:version="0.92.4 (5da689c313, 2019-01-14)" - sodipodi:docname="resources.svg"> - <defs - id="defs2" /> - <sodipodi:namedview - id="base" - pagecolor="#ffffff" - bordercolor="#666666" - borderopacity="1.0" - inkscape:pageopacity="0.0" - inkscape:pageshadow="2" - inkscape:zoom="0.98994949" - inkscape:cx="16.897058" - inkscape:cy="41.261746" - inkscape:document-units="mm" - inkscape:current-layer="layer1" - showgrid="false" - fit-margin-top="0" - fit-margin-left="0" - fit-margin-right="0" - fit-margin-bottom="0" - inkscape:window-width="1920" - inkscape:window-height="1005" - inkscape:window-x="0" - inkscape:window-y="0" - inkscape:window-maximized="1" /> - <metadata - id="metadata5"> - <rdf:RDF> - <cc:Work - rdf:about=""> - <dc:format>image/svg+xml</dc:format> - <dc:type - rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> - <dc:title></dc:title> - </cc:Work> - </rdf:RDF> - </metadata> - <g - inkscape:label="Layer 1" - inkscape:groupmode="layer" - id="layer1" - transform="translate(-36.081387,-118.50325)"> - <rect - id="rect10" - width="33.408691" - height="33.408691" - x="36.081387" - y="120.06757" - style="fill:#44aa00;stroke-width:0.26458332" /> - <circle - style="fill:#44aa00;stroke-width:0.21849461" - id="path12" - cx="87.958534" - cy="136.63828" - r="17.105247" /> - <path - sodipodi:type="star" - style="fill:#44aa00;stroke-width:0.26458332" - id="path14" - sodipodi:sides="3" - sodipodi:cx="124.13387" - sodipodi:cy="141.81859" - sodipodi:r1="23.31534" - sodipodi:r2="11.65767" - sodipodi:arg1="0.52359878" - sodipodi:arg2="1.5707963" - inkscape:flatsided="false" - inkscape:rounded="0" - inkscape:randomized="0" - d="m 144.32555,153.47626 -20.19168,0 -20.19167,0 10.09583,-17.48651 10.09584,-17.4865 10.09584,17.4865 z" - inkscape:transform-center-x="1.8384776e-06" - inkscape:transform-center-y="-5.8288369" /> - <rect - style="fill:#b3b3b3;stroke-width:0.20817307" - id="rect16" - 
width="108.24416" - height="10.423517" - x="36.08139" - y="155.5929" /> - <path - sodipodi:type="star" - style="fill:#ff8080;stroke-width:0.20018946" - id="path14-3" - sodipodi:sides="3" - sodipodi:cx="124.13387" - sodipodi:cy="139.31911" - sodipodi:r1="17.640888" - sodipodi:r2="8.8204451" - sodipodi:arg1="0.52359878" - sodipodi:arg2="1.5707963" - inkscape:flatsided="false" - inkscape:rounded="0" - inkscape:randomized="0" - d="m 139.41133,148.13955 -15.27746,0 -15.27745,0 7.63872,-13.23067 7.63873,-13.23066 7.63873,13.23066 z" - inkscape:transform-center-x="3.9117172e-06" - inkscape:transform-center-y="-4.4102243" /> - <circle - style="fill:#ff8080;stroke-width:0.18094084" - id="path12-6" - cx="87.93705" - cy="134.75125" - r="14.165282" /> - <rect - id="rect10-7" - width="30.52453" - height="25.657875" - x="37.416695" - y="121.65508" - style="fill:#ff8080;stroke-width:0.22163473" /> - <text - xml:space="preserve" - style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314" - x="47.387276" - y="151.7626" - id="text65"><tspan - sodipodi:role="line" - id="tspan63" - x="47.387276" - y="151.7626" - style="stroke-width:0.08507314">gVisor</tspan></text> - <text - xml:space="preserve" - style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314" - x="82.156319" - y="151.71547" - id="text65-5"><tspan - sodipodi:role="line" - id="tspan63-3" - x="82.156319" - y="151.71547" - style="stroke-width:0.08507314">gVisor</tspan></text> - <text - xml:space="preserve" - style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314" - x="118.66879" - 
y="151.71547" - id="text65-5-5"><tspan - sodipodi:role="line" - id="tspan63-3-6" - x="118.66879" - y="151.71547" - style="stroke-width:0.08507314">gVisor</tspan></text> - <text - xml:space="preserve" - style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847" - x="45.473087" - y="136.20644" - id="text123"><tspan - sodipodi:role="line" - id="tspan121" - x="45.473087" - y="136.20644" - style="stroke-width:0.08327847">workload</tspan></text> - <text - xml:space="preserve" - style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847" - x="80.153076" - y="136.00925" - id="text123-1"><tspan - sodipodi:role="line" - id="tspan121-2" - x="80.153076" - y="136.00925" - style="stroke-width:0.08327847">workload</tspan></text> - <text - xml:space="preserve" - style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847" - x="116.50173" - y="138.68195" - id="text123-1-7"><tspan - sodipodi:role="line" - id="tspan121-2-0" - x="116.50173" - y="138.68195" - style="stroke-width:0.08327847">workload</tspan></text> - <text - xml:space="preserve" - style="font-style:normal;font-weight:normal;font-size:6.43922186px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.16098055" - x="81.893562" - y="163.15665" - id="text163"><tspan - sodipodi:role="line" - id="tspan161" - x="81.893562" - y="163.15665" - style="stroke-width:0.16098055">host</tspan></text> - </g> -</svg> diff --git a/g3doc/architecture_guide/security.md b/g3doc/architecture_guide/security.md deleted file mode 
100644 index 9363d834c..000000000 --- a/g3doc/architecture_guide/security.md +++ /dev/null @@ -1,255 +0,0 @@ -# Security Model - -[TOC] - -gVisor was created in order to provide additional defense against the -exploitation of kernel bugs by untrusted userspace code. In order to understand -how gVisor achieves this goal, it is first necessary to understand the basic -threat model. - -## Threats: The Anatomy of an Exploit - -An exploit takes advantage of a software or hardware bug in order to escalate -privileges, gain access to privileged data, or disrupt services. All of the -possible interactions that a malicious application can have with the rest of the -system (attack vectors) define the attack surface. We categorize these attack -vectors into several common classes. - -### System API - -An operating system or hypervisor exposes an abstract System API in the form of -system calls and traps. This API may be documented and stable, as with Linux, or -it may be abstracted behind a library, as with Windows (i.e. win32.dll or -ntdll.dll). The System API includes all standard interfaces that application -code uses to interact with the system. This includes high-level abstractions -that are derived from low-level system calls, such as system files, sockets and -namespaces. - -Although the System API is exposed to applications by design, bugs and race -conditions within the kernel or hypervisor may occasionally be exploitable via -the API. This is common in part due to the fact that most kernels and -hypervisors are written in [C][clang], which is well-suited to interfacing with -hardware but often prone to security issues. In order to exploit these issues, a -typical attack might involve some combination of the following: - -1. Opening or creating some combination of files, sockets or other descriptors. -1. Passing crafted, malicious arguments, structures or packets. -1. Racing with multiple threads in order to hit specific code paths. 
- -For example, for the [Dirty Cow][dirtycow] privilege escalation bug, an -application would open a specific file in `/proc` or use a specific `ptrace` -system call, and use multiple threads in order to trigger a race condition when -touching a fresh page of memory. The attacker then gains control over a page of -memory belonging to the system. With additional privileges or access to -privileged data in the kernel, an attacker will often be able to employ -additional techniques to gain full access to the rest of the system. - -While bugs in the implementation of the System API are readily fixed, they are -also the most common form of exploit. The exposure created by this class of -exploit is what gVisor aims to minimize and control, described in detail below. - -### System ABI - -Hardware and software exploits occasionally exist in execution paths that are -not part of an intended System API. In this case, exploits may be found as part -of implicit actions the hardware or privileged system code takes in response to -certain events, such as traps or interrupts. For example, the recent -[POPSS][popss] flaw required only native code execution (no specific system call -or file access). In that case, the Xen hypervisor was similarly vulnerable, -highlighting that hypervisors are not immune to this vector. - -### Side Channels - -Hardware side channels may be exploitable by any code running on a system: -native, sandboxed, or virtualized. However, many host-level mitigations against -hardware side channels are still effective with a sandbox. For example, kernels -built with retpoline protect against some speculative execution attacks -(Spectre) and frame poisoning may protect against L1 terminal fault (L1TF) -attacks. Hypervisors may introduce additional complications in this regard, as -there is no mitigation against an application in a normally functioning Virtual -Machine (VM) exploiting the L1TF vulnerability for another VM on the sibling -hyperthread. 
- -### Other Vectors - -The above categories in no way represent an exhaustive list of exploits, as we -focus only on running untrusted code from within the operating system or -hypervisor. We do not consider other ways that a more generic adversary may -interact with a system, such as inserting a portable storage device with a -malicious filesystem image, using a combination of crafted keyboard or touch -inputs, or saturating a network device with ill-formed packets. - -Furthermore, high-level systems may contain exploitable components. An attacker -need not escalate privileges within a container if there’s an exploitable -network-accessible service on the host or some other API path. *A sandbox is not -a substitute for a secure architecture*. - -## Goals: Limiting Exposure - -![Threat model](security.png "Threat model.") - -gVisor’s primary design goal is to minimize the System API attack vector through -multiple layers of defense, while still providing a process model. There are two -primary security principles that inform this design. First, the application’s -direct interactions with the host System API are intercepted by the Sentry, -which implements the System API instead. Second, the System API accessible to -the Sentry itself is minimized to a safer, restricted set. The first principle -minimizes the possibility of direct exploitation of the host System API by -applications, and the second principle minimizes indirect exploitability, which -is the exploitation by an exploited or buggy Sentry (e.g. chaining an exploit). - -The first principle is similar to the security basis for a Virtual Machine (VM). -With a VM, an application’s interactions with the host are replaced by -interactions with a guest operating system and a set of virtualized hardware -devices. These hardware devices are then implemented via the host System API by -a Virtual Machine Monitor (VMM). 
The Sentry similarly prevents direct -interactions by providing its own implementation of the System API that the -application must interact with. Applications are not able to directly craft -specific arguments or flags for the host System API, or interact directly with -host primitives. - -For both the Sentry and a VMM, it’s worth noting that while direct interactions -are not possible, indirect interactions are still possible. For example, a read -on a host-backed file in the Sentry may ultimately result in a host read system -call (made by the Sentry, not by passing through arguments from the -application), similar to how a read on a block device in a VM may result in the -VMM issuing a corresponding host read system call from a backing file. - -An important distinction from a VM is that the Sentry implements a System API -based directly on host System API primitives instead of relying on virtualized -hardware and a guest operating system. This selects a distinct set of -trade-offs, largely in the performance, efficiency and compatibility domains. -Since transitions in and out of the sandbox are relatively expensive, a guest -operating system will typically take ownership of resources. For example, in the -above case, the guest operating system may read the block device data in a local -page cache, to avoid subsequent reads. This may lead to better performance but -lower efficiency, since memory may be wasted or duplicated. The Sentry opts -instead to defer to the host for many operations during runtime, for improved -efficiency but lower performance in some use cases. - -### What can a sandbox do? - -An application in a gVisor sandbox is permitted to do most things a standard -container can do: for example, applications can read and write files mapped -within the container, make network connections, etc. As described above, -gVisor's primary goal is to limit exposure to bugs and exploits while still -allowing most applications to run. 
Even so, gVisor will limit some operations -that might be permitted with a standard container. Even with appropriate -capabilities, a user in a gVisor sandbox will only be able to manipulate -virtualized system resources (e.g. the system time, kernel settings or -filesystem attributes) and not underlying host system resources. - -While the sandbox virtualizes many operations for the application, we limit the -sandbox's own interactions with the host to the following high-level operations: - -1. Communicate with a Gofer process via a connected socket. The sandbox may - receive new file descriptors from the Gofer process, corresponding to opened - files. These files can then be read from and written to by the sandbox. -1. Make a minimal set of host system calls. The calls do not include the - creation of new sockets (unless host networking mode is enabled) or opening - files. The calls include duplication and closing of file descriptors, - synchronization, timers and signal management. -1. Read and write packets to a virtual ethernet device. This is not required if - host networking is enabled (or networking is disabled). - -### System ABI, Side Channels and Other Vectors - -gVisor relies on the host operating system and the platform for defense against -hardware-based attacks. Given the nature of these vulnerabilities, there is -little defense that gVisor can provide (there’s no guarantee that additional -hardware measures, such as virtualization, memory encryption, etc. would -actually decrease the attack surface). Note that this is true even when using -hardware virtualization for acceleration, as the host kernel or hypervisor is -ultimately responsible for defending against attacks from within malicious -guests. - -gVisor similarly relies on the host resource mechanisms (cgroups) for defense -against resource exhaustion and denial of service attacks. 
Network policy -controls should be applied at the container level to ensure appropriate network -policy enforcement. Note that the sandbox itself is not capable of altering or -configuring these mechanisms, and the sandbox itself should make an attacker -less likely to exploit or override these controls through other means. - -## Principles: Defense-in-Depth - -For gVisor development, there are several engineering principles that are -employed in order to ensure that the system meets its design goals. - -1. No system call is passed through directly to the host. Every supported call - has an independent implementation in the Sentry, that is unlikely to suffer - from identical vulnerabilities that may appear in the host. This has the - consequence that all kernel features used by applications require an - implementation within the Sentry. -1. Only common, universal functionality is implemented. Some filesystems, - network devices or modules may expose specialized functionality to user - space applications via mechanisms such as extended attributes, raw sockets - or ioctls. Since the Sentry is responsible for implementing the full system - call surface, we do not implement or pass through these specialized APIs. -1. The host surface exposed to the Sentry is minimized. While the system call - surface is not trivial, it is explicitly enumerated and controlled. The - Sentry is not permitted to open new files, create new sockets or do many - other interesting things on the host. - -Additionally, we have practical restrictions that are imposed on the project to -minimize the risk of Sentry exploitability. For example: - -1. Unsafe code is carefully controlled. All unsafe code is isolated in files - that end with "unsafe.go", in order to facilitate validation and auditing. - No file without the unsafe suffix may import the unsafe package. -1. No CGo is allowed. The Sentry must be a pure Go binary. -1. External imports are not generally allowed within the core packages. 
Only - limited external imports are used within the setup code. The code available - inside the Sentry is carefully controlled, to ensure that the above rules - are effective. - -Finally, we recognize that security is a process, and that vigilance is -critical. Beyond our security disclosure process, the Sentry is fuzzed -continuously to identify potential bugs and races proactively, and production -crashes are recorded and triaged to similarly identify material issues. - -## FAQ - -### Is this more or less secure than a Virtual Machine? - -The security of a VM depends to a large extent on what is exposed from the host -kernel and userspace support code. For example, device emulation code in the -host kernel (e.g. APIC) or optimizations (e.g. vhost) can be more complex than a -simple system call, and exploits carry the same risks. Similarly, the userspace -support code is frequently unsandboxed, and exploits, while rare, may allow -unfettered access to the system. - -Some platforms leverage the same virtualization hardware as VMs in order to -provide better system call interception performance. However, gVisor does not -implement any device emulation, and instead opts to use a sandboxed host System -API directly. Both approaches significantly reduce the original attack surface. -Ultimately, since gVisor is capable of using the same hardware mechanism, one -should not assume that the mere use of virtualization hardware makes a system -more or less secure, just as it would be a mistake to make the claim that the -use of a unibody alone makes a car safe. - -### Does this stop hardware side channels? - -In general, gVisor does not provide protection against hardware side channels, -although it may make exploits that rely on direct access to the host System API -more difficult to use. To minimize exposure, you should follow relevant guidance -from vendors and keep your host kernel and firmware up-to-date. - -### Is this just a ptrace sandbox? 
- -No: the term “ptrace sandbox” generally refers to software that uses the Linux -ptrace facility to inspect and authorize system calls made by applications, -enforcing a specific policy. These commonly suffer from two issues. First, -vulnerable system calls may be authorized by the sandbox, as the application -still has direct access to some System API. Second, it’s impossible to avoid -time-of-check, time-of-use race conditions without disabling multi-threading. - -In gVisor, the platforms that use ptrace operate differently. The stubs that are -traced are never allowed to continue execution into the host kernel and complete -a call directly. Instead, all system calls are interpreted and handled by the -Sentry itself, which reflects resulting register state back into the tracee before -continuing execution in userspace. This is very similar to the mechanism used by -User-Mode Linux (UML). - -[dirtycow]: https://en.wikipedia.org/wiki/Dirty_COW -[clang]: https://en.wikipedia.org/wiki/C_(programming_language) -[popss]: https://nvd.nist.gov/vuln/detail/CVE-2018-8897 diff --git a/g3doc/architecture_guide/security.png b/g3doc/architecture_guide/security.png Binary files differdeleted file mode 100644 index c29befbf6..000000000 --- a/g3doc/architecture_guide/security.png +++ /dev/null diff --git a/g3doc/architecture_guide/security.svg b/g3doc/architecture_guide/security.svg deleted file mode 100644 index 0575e2dec..000000000 --- a/g3doc/architecture_guide/security.svg +++ /dev/null @@ -1,153 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" standalone="no"?> -<!-- Created with Inkscape (http://www.inkscape.org/) --> - -<svg - xmlns:dc="http://purl.org/dc/elements/1.1/" - xmlns:cc="http://creativecommons.org/ns#" - xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" - xmlns:svg="http://www.w3.org/2000/svg" - xmlns="http://www.w3.org/2000/svg" - xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" - 
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" - width="92.963379mm" - height="107.18885mm" - viewBox="0 0 92.963379 107.18885" - version="1.1" - id="svg8" - inkscape:version="0.92.4 (5da689c313, 2019-01-14)" - sodipodi:docname="defense.svg"> - <defs - id="defs2" /> - <sodipodi:namedview - id="base" - pagecolor="#ffffff" - bordercolor="#666666" - borderopacity="1.0" - inkscape:pageopacity="0.0" - inkscape:pageshadow="2" - inkscape:zoom="0.98994949" - inkscape:cx="-242.99254" - inkscape:cy="136.90181" - inkscape:document-units="mm" - inkscape:current-layer="layer4" - showgrid="false" - inkscape:object-nodes="true" - inkscape:window-width="1920" - inkscape:window-height="1005" - inkscape:window-x="0" - inkscape:window-y="0" - inkscape:window-maximized="1" - fit-margin-top="0" - fit-margin-left="0" - fit-margin-right="0" - fit-margin-bottom="0" /> - <metadata - id="metadata5"> - <rdf:RDF> - <cc:Work - rdf:about=""> - <dc:format>image/svg+xml</dc:format> - <dc:type - rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> - <dc:title></dc:title> - </cc:Work> - </rdf:RDF> - </metadata> - <g - inkscape:groupmode="layer" - id="layer2" - inkscape:label="Layer 2" - transform="translate(-61.112559,-78.160466)"> - <g - id="g4644" - style="fill:none;fill-opacity:0.34351148;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.25572576" - transform="matrix(1,0,0,-1,2.138671,277.94235)"> - <path - transform="scale(0.26458333)" - inkscape:connector-curvature="0" - style="opacity:1;fill:none;fill-opacity:0.34351148;stroke:#00a500;stroke-width:3.77952766;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.25572576" - d="M 398.57227,351.84766 224.7832,452.18359 398.57227,552.51953 572.35938,452.18359 Z" - id="path4638" /> - <path - inkscape:connector-curvature="0" - 
style="opacity:1;fill:none;fill-opacity:0.34351148;stroke:#00a500;stroke-width:3.77952766;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.25572576" - d="M 572.35938,452.18359 398.57227,552.51953 V 753.19141 L 572.35938,652.85547 Z" - transform="scale(0.26458333)" - id="path4640" /> - <path - id="path4642" - d="m 59.473888,119.64024 45.981172,26.54722 v 53.09443 L 59.473888,172.73467 Z" - style="opacity:1;fill:none;fill-opacity:0.34351148;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.25572576" - inkscape:connector-curvature="0" /> - </g> - </g> - <g - inkscape:groupmode="layer" - id="layer3" - inkscape:label="Layer 3" - transform="translate(-61.112559,-78.160466)"> - <g - id="g4554" - transform="matrix(-0.39771468,0.69855937,-0.69855937,-0.39771468,366.58103,126.65261)"> - <g - id="g4662" - transform="translate(59.46839,130.66062)"> - <path - inkscape:connector-curvature="0" - id="path4548" - transform="scale(0.26458333)" - d="M 398.57227,351.84766 224.7832,452.18359 398.57227,552.51953 572.35938,452.18359 Z" - style="opacity:1;fill:#0066ff;fill-opacity:0.34509804;stroke:#00a5ff;stroke-width:4.70182848;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> - <path - inkscape:connector-curvature="0" - id="path4550" - transform="scale(0.26458333)" - d="M 572.35938,452.18359 398.57227,552.51953 V 753.19141 L 572.35938,652.85547 Z" - style="opacity:1;fill:#0044aa;fill-opacity:0.34509804;stroke:#00a5ff;stroke-width:4.29276943;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> - <path - inkscape:connector-curvature="0" - style="opacity:1;fill:#5599ff;fill-opacity:0.34509804;stroke:#00a5ff;stroke-width:1.24402535;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" - d="m 59.473888,119.64024 45.981172,26.54722 v 53.09443 L 59.473888,172.73467 Z" - id="path4552" /> - </g> - </g> - </g> - <g - inkscape:groupmode="layer" - id="layer4" - 
inkscape:label="Layer 4" - transform="translate(-61.112559,-78.160466)"> - <path - style="fill:#e000ae;fill-opacity:1;stroke-width:0.12476727" - d="m 84.610811,107.36071 v 2.55773 2.55772 h 2.49535 2.49534 v -2.55772 -2.55773 h -2.49534 z m 40.674129,0 v 2.55773 2.55772 h 2.49535 2.49534 v -2.55772 -2.55773 h -2.49534 z m -35.558669,5.11545 v 2.55773 2.55773 h 2.49535 2.49534 v -2.55773 -2.55773 h -2.49534 z m 4.99069,5.11546 v 2.55773 2.55773 h -2.49534 -2.49535 v 2.49534 2.49535 h -2.55773 -2.55773 v 2.55773 2.55773 h -2.55773 -2.55773 v 10.16853 10.16853 h 2.55773 2.55773 v -7.67562 -7.67587 l 2.52654,0.0339 2.52654,0.0336 0.0327,5.08427 0.0327,5.08426 h 2.49388 2.49388 v 2.55919 2.5592 l 5.08427,-0.0327 5.084269,-0.0326 v -2.49534 -2.49535 l -5.084269,-0.0324 -5.08427,-0.0327 v -2.55626 -2.55651 h 12.726269 12.72626 v 2.55651 2.55626 l -5.05868,0.0327 -5.05893,0.0324 v 2.49535 2.49534 l 5.05893,0.0326 5.05868,0.0327 v -2.55919 -2.55919 h 2.49388 2.49413 l 0.0324,-5.08426 0.0327,-5.08427 2.52653,-0.0336 2.52654,-0.0339 v 7.67586 7.67563 h 2.55773 2.55773 v -10.16854 -10.16853 h -2.55773 -2.55773 v -2.55773 -2.55773 h -2.55773 -2.55773 v -2.49535 -2.49534 h -2.49535 -2.49534 v -2.55773 -2.55773 h -2.55773 -2.55773 v 2.55773 2.55773 h -7.6108 -7.610809 v -2.55773 -2.55773 h -2.55774 z m 25.452519,0 h 2.49535 2.49535 v -2.55773 -2.55773 h -2.49535 -2.49535 v 2.55773 z m -25.452519,10.10615 h 5.11546 5.115459 v 2.55773 2.55773 h -5.115459 -5.11546 v -2.55773 z m 15.221609,0 h 5.11546 5.11545 v 2.55773 2.55773 h -5.11545 -5.11546 v -2.55773 z" - id="path4732" - inkscape:connector-curvature="0" /> - </g> - <g - inkscape:label="Layer 1" - inkscape:groupmode="layer" - id="layer1" - style="display:inline" - transform="translate(-61.112559,-78.160466)"> - <g - transform="translate(-131.49557,42.495842)" - style="fill:#007200;fill-opacity:0.34351148;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" - id="g4628"> 
- <path - id="path4529" - d="m 239.09034,36.164616 -45.98169,26.547215 45.98169,26.547217 45.98117,-26.547217 z" - style="opacity:1;fill:#4aba19;fill-opacity:0.34509804;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" - inkscape:connector-curvature="0" /> - <path - id="path4531" - d="m 285.07151,62.711828 -45.98117,26.54722 v 53.094432 l 45.98117,-26.54722 z" - style="opacity:1;fill:#007900;fill-opacity:0.34351148;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" - inkscape:connector-curvature="0" /> - <path - inkscape:connector-curvature="0" - style="opacity:1;fill:#003d00;fill-opacity:0.34509804;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" - d="m 193.10865,62.711831 45.98117,26.54722 v 53.094429 l -45.98117,-26.54722 z" - id="path4541" /> - </g> - </g> -</svg> |