From 508e25b6d6e9a81edb6ddf8738450b79898b446a Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Mon, 27 Apr 2020 22:24:58 -0700 Subject: Adapt website to use g3doc sources and bazel. This adapts the merged website repository to use the image and bazel build framework. It explicitly avoids the container_image rules provided by bazel, opting instead to build with direct docker commands when necessary. The relevant build commands are incorporated into the top-level Makefile. --- g3doc/BUILD | 37 +++ g3doc/README.md | 29 ++- g3doc/architecture_guide/BUILD | 64 ++++++ g3doc/architecture_guide/Layers.png | Bin 0 -> 11044 bytes g3doc/architecture_guide/Layers.svg | 1 + .../architecture_guide/Machine-Virtualization.png | Bin 0 -> 13205 bytes .../architecture_guide/Machine-Virtualization.svg | 1 + g3doc/architecture_guide/README.md | 80 +++++++ g3doc/architecture_guide/Rule-Based-Execution.png | Bin 0 -> 6780 bytes g3doc/architecture_guide/Rule-Based-Execution.svg | 1 + g3doc/architecture_guide/Sentry-Gofer.png | Bin 0 -> 9064 bytes g3doc/architecture_guide/Sentry-Gofer.svg | 1 + g3doc/architecture_guide/performance.md | 252 +++++++++++++++++++++ g3doc/architecture_guide/platforms.md | 86 +++++++ g3doc/architecture_guide/resources.md | 1 + g3doc/architecture_guide/security.md | 251 ++++++++++++++++++++ g3doc/community.md | 31 +++ g3doc/logo.txt | 1 + g3doc/roadmap.md | 48 ++++ g3doc/user_guide/BUILD | 70 ++++++ g3doc/user_guide/FAQ.md | 112 +++++++++ g3doc/user_guide/checkpoint_restore.md | 99 ++++++++ g3doc/user_guide/compatibility.md | 87 +++++++ g3doc/user_guide/debugging.md | 129 +++++++++++ g3doc/user_guide/filesystem.md | 57 +++++ g3doc/user_guide/install.md | 158 +++++++++++++ g3doc/user_guide/networking.md | 83 +++++++ g3doc/user_guide/platforms.md | 116 ++++++++++ g3doc/user_guide/quick_start/BUILD | 33 +++ g3doc/user_guide/quick_start/docker.md | 91 ++++++++ g3doc/user_guide/quick_start/kubernetes.md | 36 +++ g3doc/user_guide/quick_start/oci.md | 44 ++++ g3doc/user_guide/tutorials/BUILD | 37 +++ g3doc/user_guide/tutorials/add-node-pool.png | Bin 0 -> 70208 bytes g3doc/user_guide/tutorials/cni.md | 172 ++++++++++++++ g3doc/user_guide/tutorials/docker.md | 68 ++++++ g3doc/user_guide/tutorials/kubernetes.md | 234 +++++++++++++++++++ g3doc/user_guide/tutorials/node-pool-button.png | Bin 0 -> 13757 bytes 38 files changed, 2508 insertions(+), 2 deletions(-) create mode 100644 g3doc/BUILD create mode 100644 g3doc/architecture_guide/BUILD create mode 100644 g3doc/architecture_guide/Layers.png create mode 100644 g3doc/architecture_guide/Layers.svg create mode 100644 g3doc/architecture_guide/Machine-Virtualization.png create mode 100644 g3doc/architecture_guide/Machine-Virtualization.svg create mode 100644 g3doc/architecture_guide/README.md create mode 100644 g3doc/architecture_guide/Rule-Based-Execution.png create mode 100644 g3doc/architecture_guide/Rule-Based-Execution.svg create mode 100644 g3doc/architecture_guide/Sentry-Gofer.png create mode 100644 g3doc/architecture_guide/Sentry-Gofer.svg create mode 100644 g3doc/architecture_guide/performance.md create mode 100644 g3doc/architecture_guide/platforms.md create mode 100644 g3doc/architecture_guide/resources.md create mode 100644 g3doc/architecture_guide/security.md create mode 100644 g3doc/community.md create mode 100644 g3doc/logo.txt create mode 100644 g3doc/roadmap.md create mode 100644 g3doc/user_guide/BUILD create mode 100644 g3doc/user_guide/FAQ.md create mode 100644 g3doc/user_guide/checkpoint_restore.md create mode 100644 
g3doc/user_guide/compatibility.md create mode 100644 g3doc/user_guide/debugging.md create mode 100644 g3doc/user_guide/filesystem.md create mode 100644 g3doc/user_guide/install.md create mode 100644 g3doc/user_guide/networking.md create mode 100644 g3doc/user_guide/platforms.md create mode 100644 g3doc/user_guide/quick_start/BUILD create mode 100644 g3doc/user_guide/quick_start/docker.md create mode 100644 g3doc/user_guide/quick_start/kubernetes.md create mode 100644 g3doc/user_guide/quick_start/oci.md create mode 100644 g3doc/user_guide/tutorials/BUILD create mode 100644 g3doc/user_guide/tutorials/add-node-pool.png create mode 100644 g3doc/user_guide/tutorials/cni.md create mode 100644 g3doc/user_guide/tutorials/docker.md create mode 100644 g3doc/user_guide/tutorials/kubernetes.md create mode 100644 g3doc/user_guide/tutorials/node-pool-button.png diff --git a/g3doc/BUILD b/g3doc/BUILD new file mode 100644 index 000000000..be3653c93 --- /dev/null +++ b/g3doc/BUILD @@ -0,0 +1,37 @@ +load("//website:defs.bzl", "doc") + +package( + default_visibility = ["//website:__pkg__"], + licenses = ["notice"], +) + +doc( + name = "index", + src = "README.md", + permalink = "/docs/", + weight = "0", +) + +doc( + name = "roadmap", + src = "roadmap.md", + category = "Project", + permalink = "/roadmap/", + weight = "10", +) + +doc( + name = "basics", + src = "basics.md", + category = "Project", + permalink = "/docs/basics/", +) + +doc( + name = "community", + src = "community.md", + category = "Project", + permalink = "/community/", + subcategory = "Community", + weight = "95", +) diff --git a/g3doc/README.md b/g3doc/README.md index 49d58cdae..7c582ba79 100644 --- a/g3doc/README.md +++ b/g3doc/README.md @@ -1,2 +1,27 @@ -The gVisor logo files are licensed under CC BY-SA 4.0 (Creative Commons -Attribution-ShareAlike 4.0 International). +# What is gVisor? + +gVisor is a user-space kernel, written in Go, that implements a substantial +portion of the [Linux system call interface][linux]. It provides an additional +layer of isolation between running applications and the host operating system. + +gVisor includes an [Open Container Initiative (OCI)][oci] runtime called `runsc` +that makes it easy to work with existing container tooling. The `runsc` runtime +integrates with Docker and Kubernetes, making it simple to run sandboxed +containers. + +gVisor takes a distinct approach to container sandboxing and makes a different +set of technical trade-offs compared to existing sandbox technologies, thus +providing new tools and ideas for the container security landscape. + +gVisor can be used with Docker, Kubernetes, or directly using `runsc`. Use the +links below to see detailed instructions for each of them: + +* [Docker](./user_guide/quick_start/docker/): The quickest and easiest way to + get started. +* [Kubernetes](./user_guide/quick_start/kubernetes/): Isolate Pods in your K8s + cluster with gVisor. +* [OCI Quick Start](./user_guide/quick_start/oci/): Expert mode. Customize + gVisor for your environment.
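+ +As a quick check that everything is wired up (assuming `runsc` has already been +installed and registered as a Docker runtime, per the quick start guides above), +you can inspect the kernel messages inside a container; they should show +gVisor's banner rather than the host kernel's: + +```bash +docker run --rm --runtime=runsc -it ubuntu dmesg +```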
+ +[linux]: https://en.wikipedia.org/wiki/Linux_kernel_interfaces +[oci]: https://www.opencontainers.org diff --git a/g3doc/architecture_guide/BUILD b/g3doc/architecture_guide/BUILD new file mode 100644 index 000000000..72038305b --- /dev/null +++ b/g3doc/architecture_guide/BUILD @@ -0,0 +1,64 @@ +load("//website:defs.bzl", "doc") + +package( + default_visibility = ["//website:__pkg__"], + licenses = ["notice"], +) + +doc( + name = "index", + src = "README.md", + category = "Architecture Guide", + data = [ + "Layers.png", + "Layers.svg", + "Machine-Virtualization.png", + "Machine-Virtualization.svg", + "Rule-Based-Execution.png", + "Rule-Based-Execution.svg", + "Sentry-Gofer.png", + "Sentry-Gofer.svg", + ], + permalink = "/docs/architecture_guide/", + weight = "0", +) + +doc( + name = "platforms", + src = "platforms.md", + category = "Architecture Guide", + data = [ + "Sentry-Gofer.png", + "Sentry-Gofer.svg", + ], + permalink = "/docs/architecture_guide/platforms/", + weight = "40", +) + +doc( + name = "resources", + src = "resources.md", + category = "Architecture Guide", + permalink = "/docs/architecture_guide/resources/", + weight = "30", +) + +doc( + name = "security", + src = "security.md", + category = "Architecture Guide", + data = [ + "Layers.png", + "Layers.svg", + ], + permalink = "/docs/architecture_guide/security/", + weight = "10", +) + +doc( + name = "performance", + src = "performance.md", + category = "Architecture Guide", + permalink = "/docs/architecture_guide/performance/", + weight = "20", +) diff --git a/g3doc/architecture_guide/Layers.png b/g3doc/architecture_guide/Layers.png new file mode 100644 index 000000000..308c6c451 Binary files /dev/null and b/g3doc/architecture_guide/Layers.png differ diff --git a/g3doc/architecture_guide/Layers.svg b/g3doc/architecture_guide/Layers.svg new file mode 100644 index 000000000..0a366f841 --- /dev/null +++ b/g3doc/architecture_guide/Layers.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/g3doc/architecture_guide/Machine-Virtualization.png b/g3doc/architecture_guide/Machine-Virtualization.png new file mode 100644 index 000000000..1ba2ed6b2 Binary files /dev/null and b/g3doc/architecture_guide/Machine-Virtualization.png differ diff --git a/g3doc/architecture_guide/Machine-Virtualization.svg b/g3doc/architecture_guide/Machine-Virtualization.svg new file mode 100644 index 000000000..5352da07b --- /dev/null +++ b/g3doc/architecture_guide/Machine-Virtualization.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/g3doc/architecture_guide/README.md b/g3doc/architecture_guide/README.md new file mode 100644 index 000000000..ce4c4ae69 --- /dev/null +++ b/g3doc/architecture_guide/README.md @@ -0,0 +1,80 @@ +# Overview + +gVisor provides a virtualized environment in order to sandbox untrusted +containers. The system interfaces normally implemented by the host kernel are +moved into a distinct, per-sandbox user space kernel in order to minimize the +risk of an exploit. gVisor does not introduce large fixed overheads, however, +and still retains a process-like model with respect to resource utilization. + +## How is this different? + +Two other approaches are commonly taken to provide stronger isolation than +native containers. + +**Machine-level virtualization**, such as [KVM][kvm] and [Xen][xen], exposes +virtualized hardware to a guest kernel via a Virtual Machine Monitor (VMM).
This +virtualized hardware is generally enlightened (paravirtualized) and additional +mechanisms can be used to improve the visibility between the guest and host +(e.g. balloon drivers, paravirtualized spinlocks). Running containers in +distinct virtual machines can provide great isolation, compatibility and +performance (though nested virtualization may bring challenges in this area), +but for containers it often requires additional proxies and agents, and may +require a larger resource footprint and slower start-up times. + +![Machine-level virtualization](Machine-Virtualization.png "Machine-level virtualization") + +**Rule-based execution**, such as [seccomp][seccomp], [SELinux][selinux] and +[AppArmor][apparmor], allows the specification of a fine-grained security policy +for an application or container. These schemes typically rely on hooks +implemented inside the host kernel to enforce the rules. If the surface can be +made small enough (i.e. a sufficiently complete policy defined), then this is an +excellent way to sandbox applications and maintain native performance. However, +in practice it can be extremely difficult (if not impossible) to reliably define +a policy for arbitrary, previously unknown applications, making this approach +challenging to apply universally. + +![Rule-based execution](Rule-Based-Execution.png "Rule-based execution") + +Rule-based execution is often combined with additional layers for +defense-in-depth. + +**gVisor** provides a third isolation mechanism, distinct from those above. + +gVisor intercepts application system calls and acts as the guest kernel, without +the need for translation through virtualized hardware. gVisor may be thought of +as either a merged guest kernel and VMM, or as seccomp on steroids. This +architecture allows it to provide a flexible resource footprint (i.e. one based +on threads and memory mappings, not fixed guest physical resources) while also +lowering the fixed costs of virtualization. However, this comes at the price of +reduced application compatibility and higher per-system call overhead. + +![gVisor](Layers.png "gVisor") + +On top of this, gVisor employs rule-based execution to provide defense-in-depth +(details below). + +gVisor's approach is similar to [User Mode Linux (UML)][uml], although UML +virtualizes hardware internally and thus provides a fixed resource footprint. + +Each of the above approaches may excel in distinct scenarios. For example, +machine-level virtualization will face challenges achieving high density, while +gVisor may provide poor performance for system call heavy workloads. + +### Why Go? + +gVisor is written in [Go][golang] in order to avoid security pitfalls that can +plague kernels. With Go, there are strong types, built-in bounds checks, no +uninitialized variables, no use-after-free, no stack overflow, and a built-in +race detector. (The use of Go has its challenges too, and isn't free.) + +### What about Gofers? 
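+ +In short, the Gofer is a per-sandbox host process that provides the Sentry with +file system access (via the 9P protocol) and an additional level of isolation; +see the [Platform Guide](./platforms.md) for details on its role.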
+ + + +[apparmor]: https://wiki.ubuntu.com/AppArmor +[golang]: https://golang.org +[kvm]: https://www.linux-kvm.org +[seccomp]: https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt +[selinux]: https://selinuxproject.org +[uml]: http://user-mode-linux.sourceforge.net/ +[xen]: https://www.xenproject.org diff --git a/g3doc/architecture_guide/Rule-Based-Execution.png b/g3doc/architecture_guide/Rule-Based-Execution.png new file mode 100644 index 000000000..b42654a90 Binary files /dev/null and b/g3doc/architecture_guide/Rule-Based-Execution.png differ diff --git a/g3doc/architecture_guide/Rule-Based-Execution.svg b/g3doc/architecture_guide/Rule-Based-Execution.svg new file mode 100644 index 000000000..bd6717043 --- /dev/null +++ b/g3doc/architecture_guide/Rule-Based-Execution.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/g3doc/architecture_guide/Sentry-Gofer.png b/g3doc/architecture_guide/Sentry-Gofer.png new file mode 100644 index 000000000..ca2c27ef7 Binary files /dev/null and b/g3doc/architecture_guide/Sentry-Gofer.png differ diff --git a/g3doc/architecture_guide/Sentry-Gofer.svg b/g3doc/architecture_guide/Sentry-Gofer.svg new file mode 100644 index 000000000..5c10750d2 --- /dev/null +++ b/g3doc/architecture_guide/Sentry-Gofer.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/g3doc/architecture_guide/performance.md b/g3doc/architecture_guide/performance.md new file mode 100644 index 000000000..fd219be5e --- /dev/null +++ b/g3doc/architecture_guide/performance.md @@ -0,0 +1,252 @@ +# Performance Guide + +gVisor is designed to provide a secure, virtualized environment while preserving +key benefits of containerization, such as small fixed overheads and a dynamic +resource footprint. For containerized infrastructure, this can provide a +turn-key solution for sandboxing untrusted workloads: there are no changes to +the fundamental resource model. + +gVisor imposes runtime costs over native containers. These costs come in two +forms: additional cycles and memory usage, which may manifest as increased +latency, reduced throughput or density, or not at all. In general, these costs +come from two different sources. + +First, the existence of the [Sentry](../) means that additional memory will be +required, and application system calls must traverse additional layers of +software. The design emphasizes [security](../security/) and therefore we chose +to use a language for the Sentry that provides benefits in this domain but may +not yet offer the raw performance of other choices. Costs imposed by these +design choices are **structural costs**. + +Second, as gVisor is an independent implementation of the system call surface, +many of the subsystems or specific calls are not as optimized as more mature +implementations. A good example here is the network stack, which is continuing +to evolve but does not support all the advanced recovery mechanisms offered by +other stacks and is less CPU efficient. This is an **implementation cost** and +is distinct from **structural costs**. Improvements here are ongoing and driven +by the workloads that matter to gVisor users and contributors. + +This page provides a guide for understanding baseline performance, and calls out +distinct **structural costs** and **implementation costs**, highlighting where +improvements are possible and not possible. + +While we include a variety of workloads here, it’s worth emphasizing that gVisor +may not be an appropriate solution for every workload, for reasons other than +performance.
For example, a sandbox may provide minimal benefit for a trusted +database, since _user data would already be inside the sandbox_ and there is no +need for an attacker to break out in the first place. + +## Methodology + +All data below was generated using the [benchmark tools][benchmark-tools] +repository, and the machines under test are uniform [Google Compute Engine][gce] +Virtual Machines (VMs) with the following specifications: + + Machine type: n1-standard-4 (broadwell) + Image: Debian GNU/Linux 9 (stretch) 4.19.0-0 + BootDisk: 2048GB SSD persistent disk + +Throughout this document, `runsc` is used to indicate the runtime provided by +gVisor. When relevant, we use the name `runsc-platform` to describe a specific +[platform choice](../platforms/). + +**Except where specified, all tests below are conducted with the `ptrace` +platform. The `ptrace` platform works everywhere and does not require hardware +virtualization or kernel modifications but suffers from the highest structural +costs by far. This platform is used to provide a clear understanding of the +performance model, but in no way represents an ideal scenario. In the future, +this guide will be extended to bare metal environments and include additional +platforms.** + +## Memory access + +gVisor does not introduce any additional costs with respect to raw memory +accesses. Page faults and other Operating System (OS) mechanisms are translated +through the Sentry, but once mappings are installed and available to the +application, there is no additional overhead. + +{% include graph.html id="sysbench-memory" url="/performance/sysbench-memory.csv" title="perf.py sysbench.memory --runtime=runc --runtime=runsc" %} + +The above figure demonstrates the memory transfer rate as measured by +`sysbench`. + +## Memory usage + +The Sentry provides an additional layer of indirection, and it requires memory +in order to store state associated with the application. This memory generally +consists of a fixed component, plus an amount that varies with the usage of +operating system resources (e.g. how many sockets or files are opened). + +For many use cases, fixed memory overheads are a primary concern. This may be +because sandboxed containers handle a low volume of requests, and it is +therefore important to achieve high densities for efficiency. + +{% include graph.html id="density" url="/performance/density.csv" title="perf.py density --runtime=runc --runtime=runsc" log="true" y_min="100000" %} + +The above figure demonstrates these costs based on three sample applications. +This test is the result of running many instances of a container (50, or 5 in +the case of redis) and calculating available memory on the host before and +afterwards, and dividing the difference by the number of containers. This +technique is used for measuring memory usage instead of the `usage_in_bytes` value of +the container cgroup because we found that some container runtimes, other than +`runc` and `runsc`, do not use an individual container cgroup. + +The first application is an instance of `sleep`: a trivial application that does +nothing. The second application is a synthetic `node` application which imports +a number of modules and listens for requests. The third application is a similar +synthetic `ruby` application which does the same. Finally, we include an +instance of `redis` storing approximately 1GB of data. In all cases, the sandbox +itself is responsible for a small, mostly fixed amount of memory overhead.
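+ +The density measurement itself is conceptually simple. The following is a +minimal sketch of the approach described above (not the actual harness from the +[benchmark tools][benchmark-tools] repository; the image, container count and +naming are illustrative only): + +```bash +# Record available memory (KiB) before starting any containers. +before=$(awk '/MemAvailable/ {print $2}' /proc/meminfo) + +# Start 50 idle containers under the runtime being measured. +for i in $(seq 1 50); do + docker run -d --runtime=runsc --name "density-$i" alpine sleep 1h +done + +# Attribute the difference in available memory to the containers. +after=$(awk '/MemAvailable/ {print $2}' /proc/meminfo) +echo "approximate per-container overhead: $(( (before - after) / 50 )) KiB" +```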
+ +## CPU performance + +gVisor does not perform emulation or otherwise interfere with the raw execution +of CPU instructions by the application. Therefore, there is no runtime cost +imposed for CPU operations. + +{% include graph.html id="sysbench-cpu" url="/performance/sysbench-cpu.csv" title="perf.py sysbench.cpu --runtime=runc --runtime=runsc" %} + +The above figure demonstrates the `sysbench` measurement of CPU events per +second. Events per second is based on a CPU-bound loop that calculates all prime +numbers in a specified range. We note that `runsc` does not impose a performance +penalty, as the code is executing natively in both cases. + +This has important consequences for classes of workloads that are often +CPU-bound, such as data processing or machine learning. In these cases, `runsc` +will similarly impose minimal runtime overhead. + +{% include graph.html id="tensorflow" url="/performance/tensorflow.csv" title="perf.py tensorflow --runtime=runc --runtime=runsc" %} + +For example, the above figure shows a sample TensorFlow workload, the +[convolutional neural network example][cnn]. The time indicated includes the +full start-up and run time for the workload, which trains a model. + +## System calls + +Some **structural costs** of gVisor are heavily influenced by the [platform +choice](../platforms/), which implements system call interception. Today, gVisor +supports a variety of platforms. These platforms present distinct performance, +compatibility and security trade-offs. For example, the KVM platform has low +overhead system call interception but runs poorly with nested virtualization. + +{% include graph.html id="syscall" url="/performance/syscall.csv" title="perf.py syscall --runtime=runc --runtime=runsc-ptrace --runtime=runsc-kvm" y_min="100" log="true" %} + +The above figure demonstrates the time required for a raw system call on various +platforms. The test is implemented by a custom binary which performs a large +number of system calls and calculates the average time required. + +This cost will principally impact applications that are system call bound, which +tend to be high-performance data stores and static network services. In general, +the impact of system call interception will be lower the more work an +application does. + +{% include graph.html id="redis" url="/performance/redis.csv" title="perf.py redis --runtime=runc --runtime=runsc" %} + +For example, `redis` is an application that performs relatively little work in +userspace: in general it reads from a connected socket, reads or modifies some +data, and writes a result back to the socket. The above figure shows the results +of running a [comprehensive set of benchmarks][redis-benchmark]. We can see that +small operations impose a large overhead, while larger operations, such as +`LRANGE`, where more work is done in the application, have a smaller relative +overhead. + +Some of these costs above are **structural costs**, and `redis` is likely to +remain a challenging performance scenario. However, optimizing the +[platform](../platforms/) will also have a dramatic impact. + +## Start-up time + +For many use cases, the ability to spin up containers quickly and efficiently is +important. A sandbox may be short-lived and perform minimal user work (e.g. a +function invocation). + +{% include graph.html id="startup" url="/performance/startup.csv" title="perf.py startup --runtime=runc --runtime=runsc" %} + +The above figure indicates the total time required to start a container through +[Docker][docker].
This benchmark uses three different applications. First, an +Alpine Linux container that executes `true`. Second, a `node` application that +loads a number of modules and binds an HTTP server. The time is measured by a +successful request to the bound port. Finally, a `ruby` application that +similarly loads a number of modules and binds an HTTP server. + +> Note: most of the time overhead above is associated with Docker itself. This is +> evident with the empty `runc` benchmark. To avoid these costs with `runsc`, +> you may also consider using `runsc do` mode or invoking the [OCI +> runtime](../../user_guide/quick_start/oci/) directly. + +## Network + +Networking is mostly bound by **implementation costs**, and gVisor's network stack +is improving quickly. + +While raw throughput is typically not an important metric in practice for common +sandbox use cases, `iperf` is a common microbenchmark used to measure it. + +{% include graph.html id="iperf" url="/performance/iperf.csv" title="perf.py iperf --runtime=runc --runtime=runsc" %} + +The above figure shows the result of an `iperf` test between two instances. For +the upload case, the specified runtime is used for the `iperf` client, and in +the download case, the specified runtime is the server. A native runtime is +always used for the other endpoint in the test. + +{% include graph.html id="applications" metric="requests_per_second" url="/performance/applications.csv" title="perf.py http.(node|ruby) --connections=25 --runtime=runc --runtime=runsc" %} + +The above figure shows the result of simple `node` and `ruby` web services that +render a template upon receiving a request. Because these synthetic benchmarks +do minimal work per request, much like the `redis` case, they suffer from high +overheads. In practice, the more work an application does, the smaller the impact +of **structural costs** becomes. + +## File system + +Some aspects of file system performance are also reflective of **implementation +costs**, and this is an area where gVisor's implementation is improving quickly. + +In terms of raw disk I/O, gVisor does not introduce significant fundamental +overhead. For general file operations, gVisor introduces a small fixed overhead +for data that transitions across the sandbox boundary. This manifests as +**structural costs** in some cases, since these operations must be routed +through the [Gofer](../) as a result of our [security model](../security/), but +in most cases are dominated by **implementation costs**, due to an internal +[Virtual File System][vfs] (VFS) implementation that needs improvement. + +{% include graph.html id="fio-bw" url="/performance/fio.csv" title="perf.py fio --engine=sync --runtime=runc --runtime=runsc" log="true" %} + +The above figures demonstrate the results of `fio` for reads and writes to and +from the disk. In this case, the disk quickly becomes the bottleneck and +dominates other costs. + +{% include graph.html id="fio-tmpfs-bw" url="/performance/fio-tmpfs.csv" title="perf.py fio --engine=sync --runtime=runc --tmpfs=True --runtime=runsc" log="true" %} + +The above figure shows the raw I/O performance of using a `tmpfs` mount which is +sandbox-internal in the case of `runsc`. Generally, these operations are +similarly bound by the cost of copying data in memory, and we don't see +the cost of VFS operations.
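+ +Each figure's title records the `perf.py` invocation that generated it, so the +results in this guide can be reproduced from the [benchmark +tools][benchmark-tools] repository. For example, the `fio` data above comes from +an invocation along these lines (a sketch that assumes the repository is checked +out and both runtimes are installed): + +```bash +# From the root of the benchmark-tools repository. +./perf.py fio --engine=sync --runtime=runc --runtime=runsc +```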
+ +{% include graph.html id="httpd100k" metric="transfer_rate" url="/performance/httpd100k.csv" title="perf.py http.httpd --connections=1 --connections=5 --connections=10 --connections=25 --runtime=runc --runtime=runsc" %} + +The high costs of VFS operations can manifest in benchmarks that execute many +such operations in the hot path for serving requests, for example. The above +figure shows the result of using gVisor to serve small pieces of static content +with predictably poor results. This workload represents `apache` serving a +single 100k file from the container image to a client running +[ApacheBench][ab] with varying levels of concurrency. The high overhead comes +principally from the VFS implementation that needs improvement, with several +internal serialization points (since all requests are reading the same file). +Note that some of the network stack performance issues also impact this +benchmark. + +{% include graph.html id="ffmpeg" url="/performance/ffmpeg.csv" title="perf.py media.ffmpeg --runtime=runc --runtime=runsc" %} + +For benchmarks that are bound by raw disk I/O and a mix of compute, file system +operations are less of an issue. The above figure shows the total time required +for an `ffmpeg` container to start, load and transcode a 27MB input video. + +[ab]: https://en.wikipedia.org/wiki/ApacheBench +[benchmark-tools]: https://github.com/google/gvisor/tree/master/benchmarks +[gce]: https://cloud.google.com/compute/ +[cnn]: https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/convolutional_network.py +[docker]: https://docker.io +[redis-benchmark]: https://redis.io/topics/benchmarks +[vfs]: https://en.wikipedia.org/wiki/Virtual_file_system diff --git a/g3doc/architecture_guide/platforms.md b/g3doc/architecture_guide/platforms.md new file mode 100644 index 000000000..1f79971d1 --- /dev/null +++ b/g3doc/architecture_guide/platforms.md @@ -0,0 +1,86 @@ +# Platform Guide + +A gVisor sandbox consists of multiple processes when running. These processes +collectively comprise a shared environment in which one or more containers can +be run. + +Each sandbox has its own isolated instance of: + +* The **Sentry**, a user-space kernel that runs the container and intercepts + and responds to system calls made by the application. + +Each container running in the sandbox has its own isolated instance of: + +* A **Gofer** which provides file system access to the container. + +![gVisor architecture diagram](Sentry-Gofer.png "gVisor architecture diagram") + +## runsc + +The entrypoint to running a sandboxed container is the `runsc` executable. +`runsc` implements the [Open Container Initiative (OCI)][oci] runtime +specification. This means that OCI compatible _filesystem bundles_ can be run by +`runsc`. Filesystem bundles consist of a `config.json` file containing +container configuration, and a root filesystem for the container. Please see +the [OCI runtime spec][runtime-spec] for more information on filesystem bundles. +`runsc` implements multiple commands that perform various functions such as +starting, stopping, listing, and querying the status of containers. + +## Sentry + +The Sentry is the largest component of gVisor. It can be thought of as a +userspace OS kernel. The Sentry implements all the kernel functionality needed +by the untrusted application. It implements all of the supported system calls, +signal delivery, memory management and page faulting logic, the threading +model, and more.
+ +When the untrusted application makes a system call, the currently used platform +redirects the call to the Sentry, which will do the necessary work to service +it. It is important to note that the Sentry will not simply pass through system +calls to the host kernel. As a userspace application, the Sentry will make some +host system calls to support its operation, but it will not allow the +application to directly control the system calls it makes. + +The Sentry aims to present an equivalent environment to (upstream) Linux v4.4. + +File system operations that extend beyond the sandbox (not internal /proc +files, pipes, etc) are sent to the Gofer, described below. + +## Platforms + +gVisor requires a platform to implement interception of syscalls, basic context +switching, and memory mapping functionality. + +### ptrace + +The ptrace platform uses `PTRACE_SYSEMU` to execute user code without allowing +it to execute host system calls. This platform can run anywhere that ptrace +works (even VMs without nested virtualization). + +### KVM (experimental) + +The KVM platform allows the Sentry to act as both guest OS and VMM, switching +back and forth between the two worlds seamlessly. The KVM platform can run on +bare-metal or in a VM with nested virtualization enabled. While there is no +virtualized hardware layer -- the sandbox retains a process model -- gVisor +leverages virtualization extensions available on modern processors in order to +improve isolation and performance of address space switches. + +## Gofer + +The Gofer is a normal host Linux process. The Gofer is started with each sandbox +and connected to the Sentry. The Sentry process is started in a restricted +seccomp container without access to file system resources. The Gofer provides +the Sentry access to file system resources via the 9P protocol and provides an +additional level of isolation. + +## Application + +The application (aka the untrusted application) is a normal Linux binary +provided to gVisor in an OCI runtime bundle. gVisor aims to provide an +environment equivalent to Linux v4.4, so applications should be able to run +unmodified. However, gVisor does not presently implement every system call, +/proc file, or /sys file so some incompatibilities may occur. + +[oci]: https://www.opencontainers.org +[runtime-spec]: https://github.com/opencontainers/runtime-spec diff --git a/g3doc/architecture_guide/resources.md b/g3doc/architecture_guide/resources.md new file mode 100644 index 000000000..7e45b58a9 --- /dev/null +++ b/g3doc/architecture_guide/resources.md @@ -0,0 +1 @@ +# Resource Model diff --git a/g3doc/architecture_guide/security.md b/g3doc/architecture_guide/security.md new file mode 100644 index 000000000..59003f0a8 --- /dev/null +++ b/g3doc/architecture_guide/security.md @@ -0,0 +1,251 @@ +# Security Model + +gVisor was created in order to provide additional defense against the +exploitation of kernel bugs by untrusted userspace code. In order to understand +how gVisor achieves this goal, it is first necessary to understand the basic +threat model. + +## Threats: The Anatomy of an Exploit + +An exploit takes advantage of a software or hardware bug in order to escalate +privileges, gain access to privileged data, or disrupt services. All of the +possible interactions that a malicious application can have with the rest of the +system (attack vectors) define the attack surface. We categorize these attack +vectors into several common classes. 
+ +### System API + +An operating system or hypervisor exposes an abstract System API in the form of +system calls and traps. This API may be documented and stable, as with Linux, or +it may be abstracted behind a library, as with Windows (i.e. win32.dll or +ntdll.dll). The System API includes all standard interfaces that application +code uses to interact with the system. This includes high-level abstractions +that are derived from low-level system calls, such as system files, sockets and +namespaces. + +Although the System API is exposed to applications by design, bugs and race +conditions within the kernel or hypervisor may occasionally be exploitable via +the API. This is common in part because most kernels and hypervisors +are written in [C][clang], which is well-suited to interfacing with hardware but +often prone to security issues. In order to exploit these issues, a typical attack +might involve some combination of the following: + +1. Opening or creating some combination of files, sockets or other descriptors. +1. Passing crafted, malicious arguments, structures or packets. +1. Racing with multiple threads in order to hit specific code paths. + +For example, for the [Dirty Cow][dirtycow] privilege escalation bug, an +application would open a specific file in `/proc` or use a specific `ptrace` +system call, and use multiple threads in order to trigger a race condition when +touching a fresh page of memory. The attacker then gains control over a page of +memory belonging to the system. With additional privileges or access to +privileged data in the kernel, an attacker will often be able to employ +additional techniques to gain full access to the rest of the system. + +While bugs in the implementation of the System API are readily fixed, they are +also the most common form of exploit. The exposure created by this class of +exploit is what gVisor aims to minimize and control, described in detail below. + +### System ABI + +Hardware and software exploits occasionally exist in execution paths that are +not part of an intended System API. In this case, exploits may be found as part +of implicit actions the hardware or privileged system code takes in response to +certain events, such as traps or interrupts. For example, the recent +[POPSS][popss] flaw required only native code execution (no specific system call +or file access). In that case, the Xen hypervisor was similarly vulnerable, +highlighting that hypervisors are not immune to this vector. + +### Side Channels + +Hardware side channels may be exploitable by any code running on a system: +native, sandboxed, or virtualized. However, many host-level mitigations against +hardware side channels are still effective with a sandbox. For example, kernels +built with retpoline protect against some speculative execution attacks +(Spectre), and frame poisoning may protect against L1 terminal fault (L1TF) +attacks. Hypervisors may introduce additional complications in this regard, as +there is no mitigation against an application in a normally functioning Virtual +Machine (VM) exploiting the L1TF vulnerability for another VM on the sibling +hyperthread. + +### Other Vectors + +The above categories in no way represent an exhaustive list of exploits, as we +focus only on running untrusted code from within the operating system or +hypervisor.
We do not consider other ways that a more generic adversary +may interact with a system, such as inserting a portable storage device with a +malicious filesystem image, using a combination of crafted keyboard or touch +inputs, or saturating a network device with ill-formed packets. + +Furthermore, high-level systems may contain exploitable components. An attacker +need not escalate privileges within a container if there’s an exploitable +network-accessible service on the host or some other API path. *A sandbox is not +a substitute for a secure architecture*. + +## Goals: Limiting Exposure + +gVisor’s primary design goal is to minimize the System API attack vector while +still providing a process model. There are two primary security principles that +inform this design. First, the application’s direct interactions with the host +System API are intercepted by the Sentry, which implements the System API +instead. Second, the System API accessible to the Sentry itself is minimized to +a safer, restricted set. The first principle minimizes the possibility of direct +exploitation of the host System API by applications, and the second principle +minimizes indirect exploitability, which is the exploitation by an exploited or +buggy Sentry (e.g. chaining an exploit). + +The first principle is similar to the security basis for a Virtual Machine (VM). +With a VM, an application’s interactions with the host are replaced by +interactions with a guest operating system and a set of virtualized hardware +devices. These hardware devices are then implemented via the host System API by +a Virtual Machine Monitor (VMM). The Sentry similarly prevents direct interactions +by providing its own implementation of the System API that the application +must interact with. Applications are not able to directly craft specific +arguments or flags for the host System API, or interact directly with host +primitives. + +For both the Sentry and a VMM, it’s worth noting that while direct interactions +are not possible, indirect interactions are still possible. For example, a read +on a host-backed file in the Sentry may ultimately result in a host read system +call (made by the Sentry, not by passing through arguments from the application), +similar to how a read on a block device in a VM may result in the VMM issuing +a corresponding host read system call from a backing file. + +An important distinction from a VM is that the Sentry implements a System API based +directly on host System API primitives instead of relying on virtualized hardware +and a guest operating system. This selects a distinct set of trade-offs, largely +in the performance, efficiency and compatibility domains. Since transitions in +and out of the sandbox are relatively expensive, a guest operating system will +typically take ownership of resources. For example, in the above case, the +guest operating system may read the block device data into a local page cache, +to avoid subsequent reads. This may lead to better performance but lower +efficiency, since memory may be wasted or duplicated. The Sentry opts instead +to defer to the host for many operations during runtime, for improved efficiency +but lower performance in some use cases. + +### What can a sandbox do? + +An application in a gVisor sandbox is permitted to do most things a standard +container can do: for example, applications can read and write files mapped +within the container, make network connections, etc.
As described above, +gVisor's primary goal is to limit exposure to bugs and exploits while still +allowing most applications to run. Even so, gVisor will limit some operations +that might be permitted with a standard container. Even with appropriate +capabilities, a user in a gVisor sandbox will only be able to manipulate +virtualized system resources (e.g. the system time, kernel settings or +filesystem attributes) and not underlying host system resources. + +While the sandbox virtualizes many operations for the application, we limit the +sandbox's own interactions with the host to the following high-level operations: + +1. Communicate with a Gofer process via a connected socket. The sandbox may + receive new file descriptors from the Gofer process, corresponding to opened + files. These files can then be read from and written to by the sandbox. +1. Make a minimal set of host system calls. The calls do not include the + creation of new sockets (unless host networking mode is enabled) or opening + files. The calls include duplication and closing of file descriptors, + synchronization, timers and signal management. +1. Read and write packets to a virtual ethernet device. This is not required if + host networking is enabled (or networking is disabled). + +### System ABI, Side Channels and Other Vectors + +gVisor relies on the host operating system and the platform for defense against +hardware-based attacks. Given the nature of these vulnerabilities, there is +little defense that gVisor can provide (there’s no guarantee that additional +hardware measures, such as virtualization, memory encryption, etc. would +actually decrease the attack surface). Note that this is true even when using +hardware virtualization for acceleration, as the host kernel or hypervisor is +ultimately responsible for defending against attacks from within malicious +guests. + +gVisor similarly relies on the host resource mechanisms (cgroups) for defense +against resource exhaustion and denial of service attacks. Network policy +controls should be applied at the container level to ensure appropriate network +policy enforcement. Note that the sandbox itself is not capable of altering or +configuring these mechanisms, and the sandbox itself should make it less likely +that an attacker can exploit or override these controls through other means. + +## Principles: Defense-in-Depth + +For gVisor development, there are several engineering principles that are +employed in order to ensure that the system meets its design goals. + +1. No system call is passed through directly to the host. Every supported call + has an independent implementation in the Sentry that is unlikely to suffer + from identical vulnerabilities that may appear in the host. This has the + consequence that all kernel features used by applications require an + implementation within the Sentry. +1. Only common, universal functionality is implemented. Some filesystems, + network devices or modules may expose specialized functionality to user + space applications via mechanisms such as extended attributes, raw sockets + or ioctls. Since the Sentry is responsible for implementing the full system + call surface, we do not implement or pass through these specialized APIs. +1. The host surface exposed to the Sentry is minimized. While the system call + surface is not trivial, it is explicitly enumerated and controlled. The + Sentry is not permitted to open new files, create new sockets or do many + other interesting things on the host.
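+ +These restrictions are reflected in the sandbox's visible footprint on the host. +For example, the separate Sentry and Gofer processes described above can be +observed directly (a rough illustration; process names and output vary by +version and configuration): + +```bash +# List gVisor-related processes for running sandboxes. +ps axo pid,comm | grep runsc +# 4715 runsc-gofer <- the Gofer, providing file system access +# 4721 runsc-sandbox <- the Sentry, running under seccomp restrictions +```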
+ +Additionally, we have practical restrictions that are imposed on the project to +minimize the risk of Sentry exploitability. For example: + +1. Unsafe code is carefully controlled. All unsafe code is isolated in files + that end with "unsafe.go", in order to facilitate validation and auditing. + No file without the unsafe suffix may import the unsafe package. +1. No CGo is allowed. The Sentry must be a pure Go binary. +1. External imports are not generally allowed within the core packages. Only + limited external imports are used within the setup code. The code available + inside the Sentry is carefully controlled, to ensure that the above rules + are effective. + +Finally, we recognize that security is a process, and that vigilance is +critical. Beyond our security disclosure process, the Sentry is fuzzed +continuously to identify potential bugs and races proactively, and production +crashes are recorded and triaged to similarly identify material issues. + +## FAQ + +### Is this more or less secure than a Virtual Machine? + +The security of a VM depends to a large extent on what is exposed from the host +kernel and user space support code. For example, device emulation code in the +host kernel (e.g. APIC) or optimizations (e.g. vhost) can be more complex than a +simple system call, and exploits carry the same risks. Similarly, the user space +support code is frequently unsandboxed, and exploits, while rare, may allow +unfettered access to the system. + +Some platforms leverage the same virtualization hardware as VMs in order to +provide better system call interception performance. However, gVisor does not +implement any device emulation, and instead opts to use a sandboxed host System +API directly. Both approaches significantly reduce the original attack surface. +Ultimately, since gVisor is capable of using the same hardware mechanism, one +should not assume that the mere use of virtualization hardware makes a system +more or less secure, just as it would be a mistake to make the claim that the +use of a unibody alone makes a car safe. + +### Does this stop hardware side channels? + +In general, gVisor does not provide protection against hardware side channels, +although it may make exploits that rely on direct access to the host System API +more difficult to use. To minimize exposure, you should follow relevant guidance +from vendors and keep your host kernel and firmware up-to-date. + +### Is this just a ptrace sandbox? + +No: the term “ptrace sandbox” generally refers to software that uses the Linux +ptrace facility to inspect and authorize system calls made by applications, +enforcing a specific policy. These commonly suffer from two issues. First, +vulnerable system calls may be authorized by the sandbox, as the application +still has direct access to some System API. Second, it’s impossible to avoid +time-of-check, time-of-use race conditions without disabling multi-threading. + +In gVisor, the platforms that use ptrace operate differently. The stubs that are +traced are never allowed to continue execution into the host kernel and complete +a call directly. Instead, all system calls are interpreted and handled by the +Sentry itself, which reflects the resulting register state back into the tracee before +continuing execution in user space. This is very similar to the mechanism used +by User-Mode Linux (UML).
+ +[dirtycow]: https://en.wikipedia.org/wiki/Dirty_COW +[clang]: https://en.wikipedia.org/wiki/C_(programming_language) +[popss]: https://nvd.nist.gov/vuln/detail/CVE-2018-8897 diff --git a/g3doc/community.md b/g3doc/community.md new file mode 100644 index 000000000..b3dc2d2cf --- /dev/null +++ b/g3doc/community.md @@ -0,0 +1,31 @@ +# Participation + +To contribute code, please read the [contributing guide](../CONTRIBUTING.md). + +Please note that the [Code of Conduct](../CODE_OF_CONDUCT.md) applies to +community forums as well as technical participation. + +## Communication channels + +The project maintains two mailing lists: + +* [gvisor-users][gvisor-users] for announcements and general discussion. +* [gvisor-dev][gvisor-dev] for development and contribution. + +We also have a [chat room hosted on Gitter][gitter-chat]. + +We'd love to hear from you! + +## Community meetings + +The community calendar shows upcoming public meetings and opportunities to +collaborate or discuss the project. Meetings are planned and announced ahead of +time via the [gvisor-users][gvisor-users] mailing list. + +These meetings are public: anyone can join. + + + +[gitter-chat]: https://gitter.im/gvisor/community +[gvisor-dev]: https://groups.google.com/forum/#!forum/gvisor-dev +[gvisor-users]: https://groups.google.com/forum/#!forum/gvisor-users diff --git a/g3doc/logo.txt b/g3doc/logo.txt new file mode 100644 index 000000000..92f9cad5f --- /dev/null +++ b/g3doc/logo.txt @@ -0,0 +1 @@ +The gVisor logo files are licensed under CC BY-SA 4.0 (Creative Commons Attribution-ShareAlike 4.0 International). diff --git a/g3doc/roadmap.md b/g3doc/roadmap.md new file mode 100644 index 000000000..86bb11c3b --- /dev/null +++ b/g3doc/roadmap.md @@ -0,0 +1,48 @@ +# Roadmap + +gVisor [GitHub Issues][issues] serve as the source-of-truth for most work in +flight. Specific performance and compatibility issues are generally tracked +there. [GitHub Milestones][milestones] may be used to track larger features that +span many issues. However, labels are also used to aggregate cross-cutting +feature work. + +## Core Improvements + +Most gVisor work is focused on four areas. + +* [Performance][performance]: overall sandbox performance, including platform + performance, is a critical area for investment. This includes: network + performance (throughput and latency), file system performance (metadata and + data I/O), application switch and fault costs, etc. The goal of gVisor is to + provide sandboxing without a material performance or efficiency impact for all + but the most performance-sensitive applications. + +* [Compatibility][compatibility]: supporting a wide range of applications + requires supporting a large system API, including special system files (e.g. + proc, sys, dev, etc.). The goal of gVisor is to support the broad set of + applications that depend on a generic Linux API, rather than a specific kernel + version. + +* [Infrastructure & tooling][infrastructure]: the above goals require aggressive + testing and coverage, and well-established processes. This includes adding + appropriate system call coverage, end-to-end suites and runtime tests. + +* [Integration][integration]: Container infrastructure is evolving rapidly and + becoming more complex, and gVisor must continuously implement relevant and + popular features to ensure that integration points remain robust and + feature-complete while preserving security guarantees. + +## Releases + +Releases are available on [GitHub][releases].
+ +As a convenience, binary packages are also published. Instructions for their use +are available via the [Installation instructions](./user_guide/install.md). + +[issues]: https://github.com/google/gvisor/issues +[milestones]: https://github.com/google/gvisor/milestones +[releases]: https://github.com/google/gvisor/releases +[performance]: https://github.com/google/gvisor/issues?q=is%3Aopen+is%3Aissue+label%3A%22area%3A+performance%22 +[integration]: https://github.com/google/gvisor/issues?q=is%3Aopen+is%3Aissue+label%3A%22area%3A+integration%22 +[compatibility]: https://github.com/google/gvisor/issues?q=is%3Aopen+is%3Aissue+label%3A%22area%3A+compatibility%22 +[infrastructure]: https://github.com/google/gvisor/issues?q=is%3Aopen+is%3Aissue+label%3A%22area%3A+tooling%22 diff --git a/g3doc/user_guide/BUILD b/g3doc/user_guide/BUILD new file mode 100644 index 000000000..5568e1ba4 --- /dev/null +++ b/g3doc/user_guide/BUILD @@ -0,0 +1,70 @@ +load("//website:defs.bzl", "doc") + +package( + default_visibility = ["//website:__pkg__"], + licenses = ["notice"], +) + +doc( + name = "compatibility", + src = "compatibility.md", + category = "Compatibility", + permalink = "/docs/user_guide/compatibility/", + weight = "0", +) + +doc( + name = "checkpoint_restore", + src = "checkpoint_restore.md", + category = "User Guide", + permalink = "/docs/user_guide/checkpoint_restore/", + weight = "60", +) + +doc( + name = "debugging", + src = "debugging.md", + category = "User Guide", + permalink = "/docs/user_guide/debugging/", + weight = "70", +) + +doc( + name = "FAQ", + src = "FAQ.md", + category = "User Guide", + permalink = "/docs/user_guide/FAQ/", + weight = "90", +) + +doc( + name = "filesystem", + src = "filesystem.md", + category = "User Guide", + permalink = "/docs/user_guide/filesystem/", + weight = "40", +) + +doc( + name = "networking", + src = "networking.md", + category = "User Guide", + permalink = "/docs/user_guide/networking/", + weight = "50", +) + +doc( + name = "install", + src = "install.md", + category = "User Guide", + permalink = "/docs/user_guide/install/", + weight = "10", +) + +doc( + name = "platforms", + src = "platforms.md", + category = "User Guide", + permalink = "/docs/user_guide/platforms/", + weight = "30", +) diff --git a/g3doc/user_guide/FAQ.md b/g3doc/user_guide/FAQ.md new file mode 100644 index 000000000..a84ac3c48 --- /dev/null +++ b/g3doc/user_guide/FAQ.md @@ -0,0 +1,112 @@ +# FAQ + +### What operating systems are supported? {#supported-os} + +Today, gVisor requires Linux. + +### What CPU architectures are supported? {#supported-cpus} + +gVisor currently supports [x86_64/AMD64](https://en.wikipedia.org/wiki/X86-64) +compatible processors. + +### Do I need to modify my Linux application to use gVisor? {#modify-app} + +No. gVisor is capable of running unmodified Linux binaries. + +### What binary formats does gVisor support? {#supported-binaries} + +gVisor supports Linux +[ELF](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format) binaries. +Binaries run in gVisor should be built for the +[AMD64](https://en.wikipedia.org/wiki/X86-64) CPU architecture. + +### Can I run Docker images using gVisor? {#docker-images} + +Yes. Please see the [Docker Quick Start][docker]. + +### Can I run Kubernetes pods using gVisor? {#k8s-pods} + +Yes. Please see the [Kubernetes Quick Start][k8s]. + +### What's the security model? {#security-model} + +See the [Security Model][security-model]. 
+ +## Troubleshooting + +### My container runs fine with `runc` but fails with `runsc` {#app-compatibility} + +If you’re having problems running a container with `runsc`, it’s most likely due +to a compatibility issue or a missing feature in gVisor. See +[Debugging][debugging]. + +### When I run my container, docker fails with: `open /run/containerd/...//log.json: no such file or directory` {#memfd-create} + +You are using an older version of Linux which doesn't support `memfd_create`. + +This is tracked in [bug #268](https://gvisor.dev/issue/268). + +### When I run my container, docker fails with: `flag provided but not defined: -console` {#old-docker} + +You're using an old version of Docker. See [Docker Quick Start][docker]. + +### I can’t see a file copied with: `docker cp` {#fs-cache} + +For performance reasons, gVisor caches directory contents, and therefore it may +not realize a new file was copied to a given directory. To invalidate the cache +and force a refresh, create a file under the directory in question and list the +contents again. + +As a workaround, a shared root filesystem can be enabled. See [Filesystem][filesystem]. + +This bug is tracked in [bug #4](https://gvisor.dev/issue/4). + +Note that `kubectl cp` works because it does the copy by exec'ing inside the +sandbox, and thus gVisor's internal cache is made aware of the new files and +directories. + +### I'm getting an error like: `panic: unable to attach: operation not permitted` or `fork/exec /proc/self/exe: invalid argument: unknown` {#runsc-perms} + +Make sure that the permissions and owner are correct on the `runsc` binary. + +```bash +sudo chown root:root /usr/local/bin/runsc +sudo chmod 0755 /usr/local/bin/runsc +``` + +### I'm getting an error like `mount submount "/etc/hostname": creating mount with source ".../hostname": input/output error: unknown.` {#memlock} + +There is a bug in Linux kernel versions 5.1 to 5.3.15, 5.4.2, and 5.5. Upgrade to a newer kernel or add the following to `/lib/systemd/system/containerd.service` as a workaround. + +``` +LimitMEMLOCK=infinity +``` + +Then run `systemctl daemon-reload && systemctl restart containerd` to restart containerd. + +See [issue #1765](https://gvisor.dev/issue/1765) for more details. + +### My container cannot resolve another container's name when using a Docker user-defined bridge {#docker-bridge} + +This is normally indicated by errors like `bad address 'container-name'` when +trying to communicate with another container in the same network. + +Docker's user-defined bridge network uses an embedded DNS server bound to the loopback +interface on address 127.0.0.10. This requires access to the host network in +order to communicate with the DNS server. The runsc network is isolated from the +host and cannot access the DNS server on the host network without breaking the +sandbox isolation. There are a few different workarounds you can try: + +* Use the default bridge network with `--link` to connect containers (see the + sketch after this list). The default bridge doesn't use embedded DNS. +* Use the [`--network=host`][host-net] option in runsc; beware that it will + use the host network stack and is less secure. +* Use IPs instead of container names. +* Use [Kubernetes][k8s]. Container name lookup works fine in Kubernetes.
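+ +For example, the first workaround might look like the following (a sketch; the +image and container names are hypothetical): + +```bash +# Default bridge network with --link; no embedded DNS is involved. +docker run -d --runtime=runsc --name backend my-backend-image +docker run -d --runtime=runsc --link backend my-frontend-image +```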
+
+[security-model]: /docs/architecture_guide/security/
+[host-net]: /docs/user_guide/networking/#network-passthrough
+[debugging]: /docs/user_guide/debugging/
+[filesystem]: /docs/user_guide/filesystem/
+[docker]: /docs/user_guide/quick_start/docker/
+[k8s]: /docs/user_guide/quick_start/kubernetes/
diff --git a/g3doc/user_guide/checkpoint_restore.md b/g3doc/user_guide/checkpoint_restore.md
new file mode 100644
index 000000000..1814a2799
--- /dev/null
+++ b/g3doc/user_guide/checkpoint_restore.md
@@ -0,0 +1,99 @@
+# Checkpoint/Restore
+
+gVisor has the ability to checkpoint a process, save its current state in a
+state file, and restore into a new container using the state file.
+
+## How to use checkpoint/restore
+
+Checkpoint/restore functionality is currently available via raw `runsc`
+commands. To use the checkpoint command, first run a container.
+
+```bash
+runsc run <container id>
+```
+
+To checkpoint the container, the `--image-path` flag must be provided. This is
+the directory path within which the checkpoint state-file will be created. The
+file will be called `checkpoint.img` and necessary directories will be created
+if they do not yet exist.
+
+> Note: Two checkpoints cannot be saved to the same directory; every image-path
+> provided must be unique.
+
+```bash
+runsc checkpoint --image-path=<image-path> <container id>
+```
+
+There is also an optional `--leave-running` flag that allows the container to
+continue to run after the checkpoint has been made. (By default, containers
+stop their processes after committing a checkpoint.)
+
+> Note: All top-level runsc flags needed when calling run must be provided to
+> checkpoint if --leave-running is used.
+
+> Note: --leave-running functions by causing an immediate restore, so the
+> container, although it will keep its given container id, may have a different
+> process id.
+
+```bash
+runsc checkpoint --image-path=<image-path> --leave-running <container id>
+```
+
+To restore, provide the image path to the `checkpoint.img` file created during
+the checkpoint. Because containers stop by default after checkpointing, restore
+needs to happen in a new container (restore is a command which parallels start).
+
+```bash
+runsc create <container id>
+
+runsc restore --image-path=<image-path> <container id>
+```
+
+## How to use checkpoint/restore in Docker
+
+Currently checkpoint/restore through `runsc` is not entirely compatible with
+Docker, although there has been progress made from both gVisor and Docker to
+enable compatibility. Here, we document the ideal workflow.
+
+Run a container:
+
+```bash
+docker run [options] --runtime=runsc <image>
+```
+
+Checkpoint a container:
+
+```bash
+docker checkpoint create <container> <checkpoint_name>
+```
+
+Create a new container into which to restore:
+
+```bash
+docker create [options] --runtime=runsc <image>
+```
+
+Restore a container:
+
+```bash
+docker start --checkpoint <checkpoint_name> --checkpoint-dir=<directory> <container>
+```
+
+### Issues Preventing Compatibility with Docker
+
+- **[Moby #37360][leave-running]:** Docker version 18.03.0-ce and earlier hangs
+  when checkpointing and does not create the checkpoint. To successfully use
+  this feature, install a custom version of docker-ce from the moby repository.
+  This issue is caused by an improper implementation of the `--leave-running`
+  flag, and is fixed in newer releases.
+- **Docker does not support restoration into new containers:** Docker currently
+  expects the container which created the checkpoint to be the same container
+  used to restore, which is not possible with runsc. When Docker supports
+  container migration and therefore restoration into new containers, this will
+  be the flow.
+- **[Moby #37344][checkpoint-dir]:** Docker does not currently support the
+  `--checkpoint-dir` flag, but this will be required when restoring from a
+  checkpoint made in another container.
+
+[leave-running]: https://github.com/moby/moby/pull/37360
+[checkpoint-dir]: https://github.com/moby/moby/issues/37344
diff --git a/g3doc/user_guide/compatibility.md b/g3doc/user_guide/compatibility.md
new file mode 100644
index 000000000..5fe9fc1e8
--- /dev/null
+++ b/g3doc/user_guide/compatibility.md
@@ -0,0 +1,87 @@
+# Applications
+
+gVisor implements a large portion of the Linux surface and while we strive to
+make it broadly compatible, there are (and always will be) unimplemented
+features and bugs. The only real way to know if it will work is to try. If you
+find a container that doesn’t work and there is no known issue, please [file a
+bug][bug] indicating the full command you used to run the image. You can view
+open issues related to compatibility [here][issues].
+
+If you're able to provide the [debug logs](../debugging/), the
+problem is likely to be fixed much faster.
+
+## What works?
+
+The following applications/images have been tested:
+
+* elasticsearch
+* golang
+* httpd
+* java8
+* jenkins
+* mariadb
+* memcached
+* mongo
+* mysql
+* nginx
+* node
+* php
+* postgres
+* prometheus
+* python
+* redis
+* registry
+* tomcat
+* wordpress
+
+## Utilities
+
+Most common utilities work. Note that:
+
+* Some tools, such as `tcpdump` and old versions of `ping`, require explicitly
+  enabling raw sockets via the unsafe `--net-raw` runsc flag.
+* Different Docker images can behave differently. For example, Alpine Linux and
+  Ubuntu have different `ip` binaries.
+
+Specific tools include:
+
+| Tool | Status |
+| --- | --- |
+| apt-get | Working |
+| bundle | Working |
+| cat | Working |
+| curl | Working |
+| dd | Working |
+| df | Working |
+| dig | Working |
+| drill | Working |
+| env | Working |
+| find | Working |
+| gdb | Working |
+| gosu | Working |
+| grep | Working (unless stdin is a pipe and stdout is /dev/null) |
+| ifconfig | Works partially, like ip. Full support [in progress](https://gvisor.dev/issue/578) |
+| ip | Some subcommands work (e.g. addr, route). Full support [in progress](https://gvisor.dev/issue/578) |
+| less | Working |
+| ls | Working |
+| lsof | Working |
+| mount | Works in readonly mode. gVisor doesn't currently support creating new mounts at runtime |
+| nc | Working |
+| nmap | Not working |
+| netstat | [In progress](https://gvisor.dev/issue/2112) |
+| nslookup | Working |
+| ping | Working |
+| ps | Working |
+| route | Working |
+| ss | [In progress](https://gvisor.dev/issue/2114) |
+| sshd | Partially working. Job control [in progress](https://gvisor.dev/issue/154) |
+| strace | Working |
+| tar | Working |
+| tcpdump | [In progress](https://gvisor.dev/issue/173) |
+| top | Working |
+| uptime | Working |
+| vim | Working |
+| wget | Working |
+
+[bug]: https://github.com/google/gvisor/issues/new?title=Compatibility%20Issue:
+[issues]: https://github.com/google/gvisor/issues?q=is%3Aissue+is%3Aopen+label%3A%22area%3A+compatibility%22
diff --git a/g3doc/user_guide/debugging.md b/g3doc/user_guide/debugging.md
new file mode 100644
index 000000000..a7c3138d7
--- /dev/null
+++ b/g3doc/user_guide/debugging.md
@@ -0,0 +1,129 @@
+# Debugging
+
+To enable debug and system call logging, add the `runtimeArgs` below to your
+[Docker](../quick_start/docker/) configuration (`/etc/docker/daemon.json`):
+
+```json
+{
+    "runtimes": {
+        "runsc": {
+            "path": "/usr/local/bin/runsc",
+            "runtimeArgs": [
+                "--debug-log=/tmp/runsc/",
+                "--debug",
+                "--strace"
+            ]
+        }
+    }
+}
+```
+
+> Note: the last `/` in `--debug-log` is needed to interpret it as a directory.
+> Then each `runsc` command executed will create a separate log file.
+> Otherwise, log messages from all commands will be appended to the same file.
+
+You may also want to pass `--log-packets` to troubleshoot network problems.
+Then restart the Docker daemon:
+
+```bash
+sudo systemctl restart docker
+```
+
+Run your container again, and inspect the files under `/tmp/runsc`. The log
+file ending with `.boot` will contain the strace logs from your application,
+which can be useful for identifying missing or broken system calls in gVisor.
+If you are having problems starting the container, the log file ending with
+`.create` may have the reason for the failure.
+
+## Stack traces
+
+The command `runsc debug --stacks` collects stack traces while the sandbox is
+running, which can be useful for troubleshooting issues or just for learning
+more about gVisor. It connects to the sandbox process, collects a stack dump,
+and writes it to the console. For example:
+
+```bash
+docker run --runtime=runsc --rm -d alpine sh -c "while true; do echo running; sleep 1; done"
+63254c6ab3a6989623fa1fb53616951eed31ac605a2637bb9ddba5d8d404b35b
+
+sudo runsc --root /var/run/docker/runtime-runsc/moby debug --stacks 63254c6ab3a6989623fa1fb53616951eed31ac605a2637bb9ddba5d8d404b35b
+```
+
+> Note: The `--root` directory is provided by Docker and is normally set to
+> `/var/run/docker/runtime-[runtime-name]/moby`. If in doubt, the value of
+> `--root` is logged in the `runsc` logs.
+
+## Debugger
+
+You can debug gVisor like any other Go program. If you're running with Docker,
+you'll need to find the sandbox PID and attach the debugger as root. Here is an
+example:
+
+```bash
+# Get a runsc with debug symbols (download nightly or build with symbols).
+bazel build -c dbg //runsc:runsc
+
+# Start the container you want to debug.
+docker run --runtime=runsc --rm --name=test -d alpine sleep 1000
+
+# Find the sandbox PID.
+docker inspect test | grep Pid | head -n 1
+
+# Attach your favorite debugger (substitute the PID found above).
+sudo dlv attach <pid>
+
+# Set a breakpoint and resume.
+break mm.MemoryManager.MMap
+continue
+```
+
+## Profiling
+
+`runsc` integrates with Go profiling tools and gives you easy commands to
+profile CPU and heap usage.
+First, you need to enable `--profile` in the command line options before
+starting the container:
+
+```json
+{
+    "runtimes": {
+        "runsc-prof": {
+            "path": "/usr/local/bin/runsc",
+            "runtimeArgs": [
+                "--profile"
+            ]
+        }
+    }
+}
+```
+
+> Note: Enabling profiling loosens the seccomp protection added to the sandbox,
+> and should not be run in production under normal circumstances.
+
+Then restart Docker to refresh the runtime options. While the container is
+running, execute `runsc debug` to collect profile information and save it to a
+file. Here are the options available:
+
+* **--profile-heap:** Generates a heap profile and writes it to the specified
+  file.
+* **--profile-cpu:** Enables the CPU profiler, waits for `--duration` seconds,
+  and writes a CPU profile to the specified file.
+
+For example:
+
+```bash
+docker run --runtime=runsc-prof --rm -d alpine sh -c "while true; do echo running; sleep 1; done"
+63254c6ab3a6989623fa1fb53616951eed31ac605a2637bb9ddba5d8d404b35b
+
+sudo runsc --root /var/run/docker/runtime-runsc-prof/moby debug --profile-heap=/tmp/heap.prof 63254c6ab3a6989623fa1fb53616951eed31ac605a2637bb9ddba5d8d404b35b
+sudo runsc --root /var/run/docker/runtime-runsc-prof/moby debug --profile-cpu=/tmp/cpu.prof --duration=30s 63254c6ab3a6989623fa1fb53616951eed31ac605a2637bb9ddba5d8d404b35b
+```
+
+The resulting files can be opened using `go tool pprof` or [pprof][]. The
+examples below create an image file (`.svg`) from the heap profile and write
+the top CPU-consuming functions to the console:
+
+```bash
+go tool pprof -svg /usr/local/bin/runsc /tmp/heap.prof
+go tool pprof -top /usr/local/bin/runsc /tmp/cpu.prof
+```
+
+[pprof]: https://github.com/google/pprof/blob/master/doc/README.md
diff --git a/g3doc/user_guide/filesystem.md b/g3doc/user_guide/filesystem.md
new file mode 100644
index 000000000..13bc07ab1
--- /dev/null
+++ b/g3doc/user_guide/filesystem.md
@@ -0,0 +1,57 @@
+# Filesystem
+
+gVisor accesses the filesystem through a file proxy, called the Gofer. The Gofer
+runs as a separate process, which is isolated from the sandbox. Gofer instances
+communicate with their respective Sentry using the 9P protocol. For a more detailed
+explanation see [Overview > Gofer](../../architecture_guide/#gofer).
+
+## Sandbox overlay
+
+To isolate the host filesystem from the sandbox, you can set a writable tmpfs
+overlay on top of the entire filesystem. All modifications are made to the
+overlay, keeping the host filesystem unmodified.
+
+> Note: All created and modified files are stored in memory inside the sandbox.
+
+To use the tmpfs overlay, add the following `runtimeArgs` to your Docker
+configuration (`/etc/docker/daemon.json`) and restart the Docker daemon:
+
+```json
+{
+    "runtimes": {
+        "runsc": {
+            "path": "/usr/local/bin/runsc",
+            "runtimeArgs": [
+                "--overlay"
+            ]
+        }
+    }
+}
+```
+
+## Shared root filesystem
+
+The root filesystem is where the image is extracted and is not generally
+modified from outside the sandbox. This allows for some optimizations, like
+skipping checks to determine if a directory has changed since the last time it
+was cached, thus potentially missing updates that happened outside the sandbox.
+If you need to `docker cp` files inside the root filesystem, you may want to
+enable shared mode. Just be aware that file system access will be slower due to
+the extra checks that are required.
+
+> Note: External mounts are always shared.
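+
+As an illustration (a minimal sketch; the paths and names here are ours, not
+from the original docs), because external mounts are always shared, a file
+created on the host side of a bind mount is immediately visible inside the
+sandbox:
+
+```bash
+# Start a sandbox with a bind mount from the host.
+docker run --runtime=runsc --rm -d --name demo -v /tmp/shared:/data alpine sleep 1000
+
+# Create a file on the host side of the mount.
+echo "hello" > /tmp/shared/greeting.txt
+
+# The file is visible inside the sandbox right away, no cache workaround needed.
+docker exec demo cat /data/greeting.txt
+```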
+
+To set the root filesystem as shared, add the following `runtimeArgs` to your
+Docker configuration (`/etc/docker/daemon.json`) and restart the Docker daemon:
+
+```json
+{
+    "runtimes": {
+        "runsc": {
+            "path": "/usr/local/bin/runsc",
+            "runtimeArgs": [
+                "--file-access=shared"
+            ]
+        }
+    }
+}
+```
diff --git a/g3doc/user_guide/install.md b/g3doc/user_guide/install.md
new file mode 100644
index 000000000..28422612e
--- /dev/null
+++ b/g3doc/user_guide/install.md
@@ -0,0 +1,158 @@
+# Installation
+
+-> Note: gVisor supports only x86\_64 and requires Linux 4.14.77+
+-> ([older Linux](./networking.md#gso)).
+
+## Versions
+
+The `runsc` binaries and repositories are available in multiple versions and
+release channels. You should pick the version you'd like to install. For
+experimentation, the nightly release is recommended. For production use, the
+latest release is recommended.
+
+After selecting an appropriate release channel from the options below, proceed
+to the preferred installation mechanism: manual or from an `apt` repository.
+
+### HEAD
+
+Binaries are built for every commit on the `master` branch, and are available
+at the following URL:
+
+    `https://storage.googleapis.com/gvisor/releases/master/latest/runsc`
+
+Checksums for the release binary are at:
+
+    `https://storage.googleapis.com/gvisor/releases/master/latest/runsc.sha512`
+
+For `apt` installation, use `master` as the `${DIST}` below.
+
+### Nightly
+
+Nightly releases are built most nights from the master branch, and are
+available at the following URL:
+
+    `https://storage.googleapis.com/gvisor/releases/nightly/latest/runsc`
+
+Checksums for the release binary are at:
+
+    `https://storage.googleapis.com/gvisor/releases/nightly/latest/runsc.sha512`
+
+Specific nightly releases can be found at:
+
+    `https://storage.googleapis.com/gvisor/releases/nightly/${yyyy-mm-dd}/runsc`
+
+Note that a release may not be available for every day.
+
+For `apt` installation, use `nightly` as the `${DIST}` below.
+
+### Latest release
+
+The latest official release is available at the following URL:
+
+    `https://storage.googleapis.com/gvisor/releases/release/latest`
+
+For `apt` installation, use `release` as the `${DIST}` below.
+
+### Specific release
+
+A given release is available at the following URL:
+
+    `https://storage.googleapis.com/gvisor/releases/release/${yyyymmdd}`
+
+See the [releases][releases] page for information about specific releases.
+
+For `apt` installation of a specific release, which may include point updates,
+use the date of the release, e.g. `${yyyymmdd}`, as the `${DIST}` below.
+
+> Note: only newer releases may be available as `apt` repositories.
+
+### Point release
+
+A given point release is available at the following URL:
+
+    `https://storage.googleapis.com/gvisor/releases/release/${yyyymmdd}.${rc}`
+
+Note that `apt` installation of a specific point release is not supported.
+
+## Install from an `apt` repository
+
+First, appropriate dependencies must be installed to allow `apt` to install
+packages via https:
+
+```bash
+sudo apt-get update && \
+sudo apt-get install -y \
+    apt-transport-https \
+    ca-certificates \
+    curl \
+    gnupg-agent \
+    software-properties-common
+```
+
+Next, the key used to sign archives should be added to your `apt` keychain:
+
+```bash
+curl -fsSL https://gvisor.dev/archive.key | sudo apt-key add -
+```
+
+Based on the release type, you will need to substitute `${DIST}` below, using
+one of:
+
+* `master`: For HEAD.
+* `nightly`: For nightly releases.
+* `release`: For the latest release.
+* `${yyyymmdd}`: For a specific release (see above).
+
+The repository for the release you wish to install should be added:
+
+```bash
+sudo add-apt-repository "deb https://storage.googleapis.com/gvisor/releases ${DIST} main"
+```
+
+For example, to install the latest official release, you can use:
+
+```bash
+sudo add-apt-repository "deb https://storage.googleapis.com/gvisor/releases release main"
+```
+
+Now the runsc package can be installed:
+
+```bash
+sudo apt-get update && sudo apt-get install -y runsc
+```
+
+If you have Docker installed, it will be automatically configured.
+
+## Install directly
+
+The binary URLs provided above can be used to install directly. For example,
+the latest nightly binary can be downloaded, validated, and placed in an
+appropriate location by running:
+
+```bash
+(
+  set -e
+  URL=https://storage.googleapis.com/gvisor/releases/nightly/latest
+  wget ${URL}/runsc
+  wget ${URL}/runsc.sha512
+  sha512sum -c runsc.sha512
+  rm -f runsc.sha512
+  sudo mv runsc /usr/local/bin
+  sudo chown root:root /usr/local/bin/runsc
+  sudo chmod 0755 /usr/local/bin/runsc
+)
+```
+
+**It is important to copy this binary to a location that is accessible to all
+users, and ensure it is executable by all users**, since `runsc` executes
+itself as user `nobody` to avoid unnecessary privileges. The `/usr/local/bin`
+directory is a good place to put the `runsc` binary.
+
+After installation, the `runsc` binary comes with an `install` command that can
+optionally configure Docker automatically:
+
+```bash
+runsc install
+```
+
+[releases]: https://github.com/google/gvisor/releases
diff --git a/g3doc/user_guide/networking.md b/g3doc/user_guide/networking.md
new file mode 100644
index 000000000..26c76e8aa
--- /dev/null
+++ b/g3doc/user_guide/networking.md
@@ -0,0 +1,83 @@
+# Networking
+
+gVisor implements its own network stack called [netstack][netstack]. All aspects
+of the network stack are handled inside the Sentry — including TCP connection
+state, control messages, and packet assembly — keeping it isolated from the host
+network stack. Data link layer packets are written directly to the virtual
+device inside the network namespace set up by Docker or Kubernetes.
+
+The IP address and routes configured for the device are transferred inside the
+sandbox. The loopback device runs exclusively inside the sandbox and does not
+use the host. You can inspect them by running:
+
+```bash
+docker run --rm --runtime=runsc alpine ip addr
+```
+
+## Network passthrough
+
+For high-performance networking applications, you may choose to disable the
+user-space network stack and instead use the host network stack, including the
+loopback. Note that this mode decreases isolation from the host.
+
+Add the following `runtimeArgs` to your Docker configuration
+(`/etc/docker/daemon.json`) and restart the Docker daemon:
+
+```json
+{
+    "runtimes": {
+        "runsc": {
+            "path": "/usr/local/bin/runsc",
+            "runtimeArgs": [
+                "--network=host"
+            ]
+        }
+    }
+}
+```
+
+## Disabling external networking
+
+To completely isolate the host and network from the sandbox, external
+networking can be disabled. The sandbox will still contain a loopback provided
+by netstack.
+
+Add the following `runtimeArgs` to your Docker configuration
+(`/etc/docker/daemon.json`) and restart the Docker daemon:
+
+```json
+{
+    "runtimes": {
+        "runsc": {
+            "path": "/usr/local/bin/runsc",
+            "runtimeArgs": [
+                "--network=none"
+            ]
+        }
+    }
+}
+```
+
+### Disable GSO {#gso}
+
+If your Linux kernel is older than 4.14.17, you can disable Generic
+Segmentation Offload (GSO) and run with any kernel newer than 3.17. Add the
+`--gso=false` flag to your Docker runtime configuration
+(`/etc/docker/daemon.json`) and restart the Docker daemon:
+
+> Note: Network performance, especially for large payloads, will be greatly reduced.
+
+```json
+{
+    "runtimes": {
+        "runsc": {
+            "path": "/usr/local/bin/runsc",
+            "runtimeArgs": [
+                "--gso=false"
+            ]
+        }
+    }
+}
+```
+
+[netstack]: https://github.com/google/netstack
diff --git a/g3doc/user_guide/platforms.md b/g3doc/user_guide/platforms.md
new file mode 100644
index 000000000..fb48db34f
--- /dev/null
+++ b/g3doc/user_guide/platforms.md
@@ -0,0 +1,116 @@
+# Platforms (KVM)
+
+This document will help you set up your system to use a different gVisor
+platform.
+
+## What is a Platform?
+
+gVisor requires a *platform* to implement interception of syscalls, basic
+context switching, and memory mapping functionality. These are described in
+more depth in the [Platform Design](../../architecture_guide/platforms/).
+
+## Selecting a Platform
+
+The platform is selected by the `--platform` command line flag passed to
+`runsc`. By default, the ptrace platform is selected. To select a different
+platform, modify your Docker configuration (`/etc/docker/daemon.json`) to
+pass this argument:
+
+```json
+{
+    "runtimes": {
+        "runsc": {
+            "path": "/usr/local/bin/runsc",
+            "runtimeArgs": [
+                "--platform=kvm"
+            ]
+        }
+    }
+}
+```
+
+You must restart the Docker daemon after making changes to this file;
+typically, this is done via `systemd`:
+
+```bash
+sudo systemctl restart docker
+```
+
+## Example: Using the KVM Platform
+
+The KVM platform is currently experimental; however, it provides several
+benefits over the default ptrace platform.
+
+### Prerequisites
+
+You will need to have KVM installed on your system. If you are running a
+Debian-based system such as Debian or Ubuntu, you can usually do this by
+installing the `qemu-kvm` package.
+
+```bash
+sudo apt-get install qemu-kvm
+```
+
+If you are using a virtual machine, you will need to make sure that nested
+virtualization is configured. Here are links to documents on how to set up
+nested virtualization in several popular environments:
+
+* Google Cloud: [Enabling Nested Virtualization for VM Instances][nested-gcp]
+* Microsoft Azure: [How to enable nested virtualization in an Azure VM][nested-azure]
+* VirtualBox: [Nested Virtualization][nested-virtualbox]
+* KVM: [Nested Guests][nested-kvm]
+
+***Note: nested virtualization will have poor performance and is historically a
+cause of security issues (e.g.
+[CVE-2018-12904](https://nvd.nist.gov/vuln/detail/CVE-2018-12904)). It is not
+recommended for production.***
+
+### Configuring Docker
+
+Per above, you will need to configure Docker to use `runsc` with the KVM
+platform. You will remember from the Docker Quick Start that you configured
+Docker to use `runsc` as the runtime. Docker allows you to add multiple
+runtimes to the Docker configuration.
+
+Add a new entry for the KVM platform to your Docker configuration
+(`/etc/docker/daemon.json`) in order to provide the `--platform=kvm` runtime
+argument.
+
+In the end, the file should look something like:
+
+```json
+{
+    "runtimes": {
+        "runsc": {
+            "path": "/usr/local/bin/runsc"
+        },
+        "runsc-kvm": {
+            "path": "/usr/local/bin/runsc",
+            "runtimeArgs": [
+                "--platform=kvm"
+            ]
+        }
+    }
+}
+```
+
+You must restart the Docker daemon after making changes to this file;
+typically, this is done via `systemd`:
+
+```bash
+sudo systemctl restart docker
+```
+
+## Running a container
+
+Now run your container using the `runsc-kvm` runtime. This will run the
+container using the KVM platform:
+
+```bash
+docker run --runtime=runsc-kvm --rm hello-world
+```
+
+[nested-azure]: https://docs.microsoft.com/en-us/azure/virtual-machines/windows/nested-virtualization
+[nested-gcp]: https://cloud.google.com/compute/docs/instances/enable-nested-virtualization-vm-instances
+[nested-virtualbox]: https://www.virtualbox.org/manual/UserManual.html#nested-virt
+[nested-kvm]: https://www.linux-kvm.org/page/Nested_Guests
diff --git a/g3doc/user_guide/quick_start/BUILD b/g3doc/user_guide/quick_start/BUILD
new file mode 100644
index 000000000..63f17f9cb
--- /dev/null
+++ b/g3doc/user_guide/quick_start/BUILD
@@ -0,0 +1,33 @@
+load("//website:defs.bzl", "doc")
+
+package(
+    default_visibility = ["//website:__pkg__"],
+    licenses = ["notice"],
+)
+
+doc(
+    name = "docker",
+    src = "docker.md",
+    category = "User Guide",
+    permalink = "/docs/user_guide/quick_start/docker/",
+    subcategory = "Quick Start",
+    weight = "11",
+)
+
+doc(
+    name = "oci",
+    src = "oci.md",
+    category = "User Guide",
+    permalink = "/docs/user_guide/quick_start/oci/",
+    subcategory = "Quick Start",
+    weight = "12",
+)
+
+doc(
+    name = "kubernetes",
+    src = "kubernetes.md",
+    category = "User Guide",
+    permalink = "/docs/user_guide/quick_start/kubernetes/",
+    subcategory = "Quick Start",
+    weight = "13",
+)
diff --git a/g3doc/user_guide/quick_start/docker.md b/g3doc/user_guide/quick_start/docker.md
new file mode 100644
index 000000000..7dfc3d4b7
--- /dev/null
+++ b/g3doc/user_guide/quick_start/docker.md
@@ -0,0 +1,91 @@
+# Docker
+
+> Note: This guide requires Docker version 17.09.0 or greater. Refer to the
+> [Docker documentation][docker] for how to install it.
+
+This guide will help you quickly get started running Docker containers using
+gVisor.
+
+First, follow the [Installation guide][install].
+
+If you use the `apt` repository or the `automated` install, then you can skip
+the next section and proceed straight to running a container.
+
+## Configuring Docker
+
+First you will need to configure Docker to use `runsc` by adding a runtime
+entry to your Docker configuration (`/etc/docker/daemon.json`). You may have to
+create this file if it does not exist. Some Docker versions also require you to
+[specify the `storage-driver` field][storage-driver].
+
+In the end, the file should look something like:
+
+```json
+{
+    "runtimes": {
+        "runsc": {
+            "path": "/usr/local/bin/runsc"
+        }
+    }
+}
+```
+
+You must restart the Docker daemon after making changes to this file;
+typically, this is done via `systemd`:
+
+```bash
+sudo systemctl restart docker
+```
+
+## Running a container
+
+Now run your container using the `runsc` runtime:
+
+```bash
+docker run --runtime=runsc --rm hello-world
+```
+
+You can also run a terminal to explore the container.
+
+```bash
+docker run --runtime=runsc --rm -it ubuntu /bin/bash
+```
+
+Many Docker options are compatible with gVisor; try them out.
+Here is an example:
+
+```bash
+docker run --runtime=runsc --rm --link backend:database -v ~/bin:/tools:ro -p 8080:80 --cpus=0.5 -it busybox telnet towel.blinkenlights.nl
+```
+
+## Verify the runtime
+
+You can verify that you are running in gVisor using the `dmesg` command.
+
+```text
+$ docker run --runtime=runsc -it ubuntu dmesg
+[ 0.000000] Starting gVisor...
+[ 0.354495] Daemonizing children...
+[ 0.564053] Constructing home...
+[ 0.976710] Preparing for the zombie uprising...
+[ 1.299083] Creating process schedule...
+[ 1.479987] Committing treasure map to memory...
+[ 1.704109] Searching for socket adapter...
+[ 1.748935] Generating random numbers by fair dice roll...
+[ 2.059747] Digging up root...
+[ 2.259327] Checking naughty and nice process list...
+[ 2.610538] Rewriting operating system in Javascript...
+[ 2.613217] Ready!
+```
+
+Note that this is easily replicated by an attacker, so applications should
+never use `dmesg` to verify the runtime in a security-sensitive context.
+
+Next, look at the different options available for gVisor: [platform][platforms],
+[network][networking], [filesystem][filesystem].
+
+[docker]: https://docs.docker.com/install/
+[storage-driver]: https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-storage-driver
+[install]: /docs/user_guide/install/
+[filesystem]: /docs/user_guide/filesystem/
+[networking]: /docs/user_guide/networking/
+[platforms]: /docs/user_guide/platforms/
diff --git a/g3doc/user_guide/quick_start/kubernetes.md b/g3doc/user_guide/quick_start/kubernetes.md
new file mode 100644
index 000000000..237b3c17f
--- /dev/null
+++ b/g3doc/user_guide/quick_start/kubernetes.md
@@ -0,0 +1,36 @@
+# Kubernetes
+
+gVisor can be used to run Kubernetes pods and has several integration points
+with Kubernetes.
+
+## Using Minikube
+
+gVisor can run sandboxed containers in a Kubernetes cluster with Minikube.
+After the gVisor addon is enabled, pods with
+`io.kubernetes.cri.untrusted-workload` set to true will execute with `runsc`.
+Follow [these instructions][minikube] to enable the gVisor addon.
+
+## Using Containerd
+
+You can also set up Kubernetes nodes to run pods in gVisor using the
+[containerd][containerd] CRI runtime and the `gvisor-containerd-shim`. You can
+use either the `io.kubernetes.cri.untrusted-workload` annotation or
+[RuntimeClass][runtimeclass] to run Pods with `runsc`. You can find
+instructions [here][gvisor-containerd-shim].
+
+## Using GKE Sandbox
+
+[GKE Sandbox][gke-sandbox] is available in [Google Kubernetes Engine][gke]. You
+just need to deploy a node pool with gVisor enabled in your cluster, and it
+will run pods annotated with `runtimeClassName: gvisor` inside a gVisor sandbox
+for you. [Here][wordpress-quick] is a quick example showing how to deploy a
+WordPress site. You can view the full documentation [here][gke-sandbox-docs].
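+
+Whichever integration you choose, scheduling pods onto gVisor ultimately comes
+down to the pod spec. Here is a minimal sketch (the names are illustrative, and
+it assumes a cluster whose nodes already have a `runsc` handler configured, for
+example via containerd or GKE Sandbox):
+
+```bash
+# Register a RuntimeClass whose handler points at the runsc configuration.
+cat <<'EOF' | kubectl apply -f -
+apiVersion: node.k8s.io/v1beta1
+kind: RuntimeClass
+metadata:
+  name: gvisor
+handler: runsc
+EOF
+
+# Schedule a pod onto that runtime class.
+cat <<'EOF' | kubectl apply -f -
+apiVersion: v1
+kind: Pod
+metadata:
+  name: nginx-sandboxed
+spec:
+  runtimeClassName: gvisor
+  containers:
+  - name: nginx
+    image: nginx
+EOF
+```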
+
+[containerd]: https://containerd.io/
+[minikube]: https://github.com/kubernetes/minikube/blob/master/deploy/addons/gvisor/README.md
+[gke]: https://cloud.google.com/kubernetes-engine/
+[gke-sandbox]: https://cloud.google.com/kubernetes-engine/sandbox/
+[gke-sandbox-docs]: https://cloud.google.com/kubernetes-engine/docs/how-to/sandbox-pods
+[gvisor-containerd-shim]: https://github.com/google/gvisor-containerd-shim
+[runtimeclass]: https://kubernetes.io/docs/concepts/containers/runtime-class/
+[wordpress-quick]: /docs/tutorials/kubernetes/
diff --git a/g3doc/user_guide/quick_start/oci.md b/g3doc/user_guide/quick_start/oci.md
new file mode 100644
index 000000000..271ed24ce
--- /dev/null
+++ b/g3doc/user_guide/quick_start/oci.md
@@ -0,0 +1,44 @@
+# OCI
+
+This guide will quickly get you started running your first gVisor sandbox
+container using the runtime directly with the default platform.
+
+First, follow the [Installation guide][install].
+
+## Run an OCI-compatible container
+
+Now we will create an [OCI][oci] container bundle to run our container. First,
+we will create a root directory for our bundle.
+
+```bash
+mkdir bundle
+cd bundle
+```
+
+Create a root file system for the container. We will use the Docker hello-world
+image as the basis for our container.
+
+```bash
+mkdir rootfs
+docker export $(docker create hello-world) | tar -xf - -C rootfs
+```
+
+Next, create a specification file called `config.json` that contains our
+container specification. We will update the default command it runs to `/hello`
+in the `hello-world` container.
+
+```bash
+runsc spec
+sed -i 's;"sh";"/hello";' config.json
+```
+
+Finally, run the container.
+
+```bash
+sudo runsc run hello
+```
+
+Next, try [using CNI to set up networking](../../../tutorials/cni/) or
+[running gVisor using Docker](../docker/).
+
+[oci]: https://opencontainers.org/
+[install]: /docs/user_guide/install
diff --git a/g3doc/user_guide/tutorials/BUILD b/g3doc/user_guide/tutorials/BUILD
new file mode 100644
index 000000000..caae98623
--- /dev/null
+++ b/g3doc/user_guide/tutorials/BUILD
@@ -0,0 +1,37 @@
+load("//website:defs.bzl", "doc")
+
+package(
+    default_visibility = ["//website:__pkg__"],
+    licenses = ["notice"],
+)
+
+doc(
+    name = "docker",
+    src = "docker.md",
+    category = "User Guide",
+    permalink = "/docs/tutorials/docker/",
+    subcategory = "Tutorials",
+    weight = "21",
+)
+
+doc(
+    name = "cni",
+    src = "cni.md",
+    category = "User Guide",
+    permalink = "/docs/tutorials/cni/",
+    subcategory = "Tutorials",
+    weight = "22",
+)
+
+doc(
+    name = "kubernetes",
+    src = "kubernetes.md",
+    category = "User Guide",
+    data = [
+        "add-node-pool.png",
+        "node-pool-button.png",
+    ],
+    permalink = "/docs/tutorials/kubernetes/",
+    subcategory = "Tutorials",
+    weight = "33",
+)
diff --git a/g3doc/user_guide/tutorials/add-node-pool.png b/g3doc/user_guide/tutorials/add-node-pool.png
new file mode 100644
index 000000000..e4560359b
Binary files /dev/null and b/g3doc/user_guide/tutorials/add-node-pool.png differ
diff --git a/g3doc/user_guide/tutorials/cni.md b/g3doc/user_guide/tutorials/cni.md
new file mode 100644
index 000000000..6546f2737
--- /dev/null
+++ b/g3doc/user_guide/tutorials/cni.md
@@ -0,0 +1,172 @@
+# Using CNI
+
+This tutorial will show you how to set up networking for a gVisor sandbox using
+the [Container Networking Interface (CNI)](https://github.com/containernetworking/cni).
+
+## Install CNI Plugins
+
+First you will need to install the CNI plugins.
+CNI plugins are used to set up a network namespace that `runsc` can use with
+the sandbox.
+
+Start by creating the directories for CNI plugin binaries:
+
+```
+sudo mkdir -p /opt/cni/bin
+```
+
+Download the CNI plugins:
+
+```
+wget https://github.com/containernetworking/plugins/releases/download/v0.8.3/cni-plugins-linux-amd64-v0.8.3.tgz
+```
+
+Next, unpack the plugins into the CNI binary directory:
+
+```
+sudo tar -xvf cni-plugins-linux-amd64-v0.8.3.tgz -C /opt/cni/bin/
+```
+
+## Configure CNI Plugins
+
+This section will show you how to configure CNI plugins. This tutorial will use
+the "bridge" and "loopback" plugins, which will create the necessary bridge and
+loopback devices in our network namespace. However, you should be able to use
+any CNI-compatible plugin to set up networking for gVisor sandboxes.
+
+The bridge plugin configuration specifies the subnet range for IP addresses
+that will be assigned to sandboxes, as well as the network routing
+configuration. This tutorial will assign IP addresses from the `10.22.0.0/16`
+range and allow all outbound traffic; however, you can modify this
+configuration to suit your use case.
+
+Create the bridge and loopback plugin configurations:
+
+```
+sudo mkdir -p /etc/cni/net.d
+
+sudo sh -c 'cat > /etc/cni/net.d/10-bridge.conf << EOF
+{
+  "cniVersion": "0.4.0",
+  "name": "mynet",
+  "type": "bridge",
+  "bridge": "cni0",
+  "isGateway": true,
+  "ipMasq": true,
+  "ipam": {
+    "type": "host-local",
+    "subnet": "10.22.0.0/16",
+    "routes": [
+      { "dst": "0.0.0.0/0" }
+    ]
+  }
+}
+EOF'
+
+sudo sh -c 'cat > /etc/cni/net.d/99-loopback.conf << EOF
+{
+  "cniVersion": "0.4.0",
+  "name": "lo",
+  "type": "loopback"
+}
+EOF'
+```
+
+## Create a Network Namespace
+
+For each gVisor sandbox you will create a network namespace and configure it
+using CNI. First, create a random network namespace name and then create
+the namespace.
+
+The network namespace path will then be `/var/run/netns/${CNI_CONTAINERID}`.
+
+```
+export CNI_PATH=/opt/cni/bin
+export CNI_CONTAINERID=$(printf '%x%x%x%x' $RANDOM $RANDOM $RANDOM $RANDOM)
+export CNI_COMMAND=ADD
+export CNI_NETNS=/var/run/netns/${CNI_CONTAINERID}
+
+sudo ip netns add ${CNI_CONTAINERID}
+```
+
+Next, run the bridge and loopback plugins to apply the configuration that was
+created earlier to the namespace. Each plugin outputs some JSON indicating the
+results of executing the plugin. For example, the bridge plugin's response
+includes the IP address assigned to the Ethernet device created in the network
+namespace. Take note of the IP address for use later.
+
+```
+export CNI_IFNAME="eth0"
+sudo -E /opt/cni/bin/bridge < /etc/cni/net.d/10-bridge.conf
+export CNI_IFNAME="lo"
+sudo -E /opt/cni/bin/loopback < /etc/cni/net.d/99-loopback.conf
+```
+
+Get the IP address assigned to our sandbox:
+
+```
+POD_IP=$(sudo ip netns exec ${CNI_CONTAINERID} ip -4 addr show eth0 | grep -oP '(?<=inet\s)\d+(\.\d+){3}')
+```
+
+## Create the OCI Bundle
+
+Now that our network namespace is created and configured, we can create the OCI
+bundle for our container. As part of the bundle's `config.json` we will specify
+that the container use the network namespace that we created.
+
+The container will run a simple Python web server that we will be able to
+connect to using the IP address assigned to it by the bridge CNI plugin.
+
+Create the bundle and root filesystem directories:
+
+```
+sudo mkdir -p bundle
+cd bundle
+sudo mkdir rootfs
+sudo docker export $(docker create python) | sudo tar --same-owner -pxf - -C rootfs
+sudo mkdir -p rootfs/var/www/html
+sudo sh -c 'echo "Hello World!" > rootfs/var/www/html/index.html'
+```
+
+Next, create the `config.json` specifying the network namespace.
+
+```
+sudo /usr/local/bin/runsc spec
+sudo sed -i 's;"sh";"python", "-m", "http.server";' config.json
+sudo sed -i "s;\"cwd\": \"/\";\"cwd\": \"/var/www/html\";" config.json
+sudo sed -i "s;\"type\": \"network\";\"type\": \"network\",\n\t\t\t\t\"path\": \"/var/run/netns/${CNI_CONTAINERID}\";" config.json
+```
+
+## Run the Container
+
+Now we can run and connect to the webserver. Run the container in gVisor. Use
+the same ID used for the network namespace to be consistent:
+
+```
+sudo runsc run -detach ${CNI_CONTAINERID}
+```
+
+Connect to the server via the sandbox's IP address:
+
+```
+curl http://${POD_IP}:8000/
+```
+
+You should see the server returning `Hello World!`.
+
+## Cleanup
+
+After you are finished running the container, you can clean up the network
+namespace.
+
+```
+sudo runsc kill ${CNI_CONTAINERID}
+sudo runsc delete ${CNI_CONTAINERID}
+
+export CNI_COMMAND=DEL
+
+export CNI_IFNAME="lo"
+sudo -E /opt/cni/bin/loopback < /etc/cni/net.d/99-loopback.conf
+export CNI_IFNAME="eth0"
+sudo -E /opt/cni/bin/bridge < /etc/cni/net.d/10-bridge.conf
+
+sudo ip netns delete ${CNI_CONTAINERID}
+```
diff --git a/g3doc/user_guide/tutorials/docker.md b/g3doc/user_guide/tutorials/docker.md
new file mode 100644
index 000000000..514af8489
--- /dev/null
+++ b/g3doc/user_guide/tutorials/docker.md
@@ -0,0 +1,68 @@
+# WordPress with Docker
+
+This page shows you how to deploy a sample [WordPress][wordpress] site using
+[Docker][docker].
+
+### Before you begin
+
+[Follow these instructions][docker-install] to install runsc with Docker.
+This document assumes that the runtime name chosen is `runsc`.
+
+### Running WordPress
+
+Now, let's deploy a WordPress site using Docker. A WordPress site requires
+two containers: a web server in the frontend and a MySQL database in the
+backend.
+
+First, let's define a few environment variables that are shared between both
+containers:
+
+```bash
+export MYSQL_PASSWORD=${YOUR_SECRET_PASSWORD_HERE?}
+export MYSQL_DB=wordpress
+export MYSQL_USER=wordpress
+```
+
+Next, let's start the database container running MySQL and wait until the
+database is initialized:
+
+```bash
+docker run --runtime=runsc --name mysql -d \
+    -e MYSQL_RANDOM_ROOT_PASSWORD=1 \
+    -e MYSQL_PASSWORD="${MYSQL_PASSWORD}" \
+    -e MYSQL_DATABASE="${MYSQL_DB}" \
+    -e MYSQL_USER="${MYSQL_USER}" \
+    mysql:5.7
+
+# Wait until this message appears in the log.
+docker logs mysql |& grep 'port: 3306 MySQL Community Server (GPL)'
+```
+
+Once the database is running, you can start the WordPress frontend. We use the
+`--link` option to connect the frontend to the database, and expose WordPress
+on port 8080 on the localhost.
+
+```bash
+docker run --runtime=runsc --name wordpress -d \
+    --link mysql:mysql \
+    -p 8080:80 \
+    -e WORDPRESS_DB_HOST=mysql \
+    -e WORDPRESS_DB_USER="${MYSQL_USER}" \
+    -e WORDPRESS_DB_PASSWORD="${MYSQL_PASSWORD}" \
+    -e WORDPRESS_DB_NAME="${MYSQL_DB}" \
+    -e WORDPRESS_TABLE_PREFIX=wp_ \
+    wordpress
+```
+
+Now, you can access the WordPress website by pointing your favorite browser to
+http://localhost:8080.
+
+Congratulations! You have just deployed a WordPress site using Docker.
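+
+As an optional sanity check (not part of the original steps), you can confirm
+from the command line that both containers are up and that WordPress is
+answering on port 8080:
+
+```bash
+# Both containers should be listed as running.
+docker ps --filter name=wordpress --filter name=mysql
+
+# WordPress answers on port 8080 (it redirects to its install page at first).
+curl -sI http://localhost:8080 | head -n 1
+```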
+
+### What's next
+
+[Learn how to deploy WordPress with Kubernetes][wordpress-k8s].
+
+[docker]: https://www.docker.com/
+[docker-install]: /docs/user_guide/quick_start/docker/
+[wordpress]: https://wordpress.com/
+[wordpress-k8s]: /docs/tutorials/kubernetes/
diff --git a/g3doc/user_guide/tutorials/kubernetes.md b/g3doc/user_guide/tutorials/kubernetes.md
new file mode 100644
index 000000000..a686c1982
--- /dev/null
+++ b/g3doc/user_guide/tutorials/kubernetes.md
@@ -0,0 +1,234 @@
+# WordPress with Kubernetes
+
+This page shows you how to deploy a sample [WordPress][wordpress] site using
+[GKE Sandbox][gke-sandbox].
+
+### Before you begin
+
+Take the following steps to enable the Kubernetes Engine API:
+
+1.  Visit the [Kubernetes Engine page][project-selector] in the Google Cloud
+    Platform Console.
+1.  Create or select a project.
+
+### Creating a node pool with gVisor enabled
+
+Create a node pool inside your cluster with the option `--sandbox type=gvisor`
+added to the command, as shown below:
+
+```bash
+gcloud beta container node-pools create sandbox-pool --cluster=${CLUSTER_NAME} --image-type=cos_containerd --sandbox type=gvisor
+```
+
+If you prefer to use the console, select your cluster and click the **ADD NODE
+POOL** button:
+
+![+ ADD NODE POOL](./node-pool-button.png)
+
+Then set the **Image type** to **Containerd** and select the **Enable sandbox
+with gVisor** option. Set other options as you like:
+
+![+ NODE POOL](./add-node-pool.png)
+
+### Check that gVisor is enabled
+
+The gvisor RuntimeClass is instantiated during node creation. You can check for
+the existence of the gvisor RuntimeClass using the following command:
+
+```bash
+kubectl get runtimeclasses
+```
+
+### WordPress deployment
+
+Now, let's deploy a WordPress site using GKE Sandbox. A WordPress site requires
+two pods: a web server in the frontend and a MySQL database in the backend.
+Both applications use PersistentVolumes to store the site data.
+In addition, they use a secret store to share the MySQL password between them.
+
+First, let's download the deployment configuration files to add the runtime
+class annotation to them:
+
+```bash
+curl -LO https://k8s.io/examples/application/wordpress/wordpress-deployment.yaml
+curl -LO https://k8s.io/examples/application/wordpress/mysql-deployment.yaml
+```
+
+Add a **spec.template.spec.runtimeClassName** set to **gvisor** to both files,
+as shown below:
+
+**wordpress-deployment.yaml:**
+
+```yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: wordpress
+  labels:
+    app: wordpress
+spec:
+  ports:
+    - port: 80
+  selector:
+    app: wordpress
+    tier: frontend
+  type: LoadBalancer
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: wp-pv-claim
+  labels:
+    app: wordpress
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 20Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: wordpress
+  labels:
+    app: wordpress
+spec:
+  selector:
+    matchLabels:
+      app: wordpress
+      tier: frontend
+  strategy:
+    type: Recreate
+  template:
+    metadata:
+      labels:
+        app: wordpress
+        tier: frontend
+    spec:
+      runtimeClassName: gvisor # ADD THIS LINE
+      containers:
+      - image: wordpress:4.8-apache
+        name: wordpress
+        env:
+        - name: WORDPRESS_DB_HOST
+          value: wordpress-mysql
+        - name: WORDPRESS_DB_PASSWORD
+          valueFrom:
+            secretKeyRef:
+              name: mysql-pass
+              key: password
+        ports:
+        - containerPort: 80
+          name: wordpress
+        volumeMounts:
+        - name: wordpress-persistent-storage
+          mountPath: /var/www/html
+      volumes:
+      - name: wordpress-persistent-storage
+        persistentVolumeClaim:
+          claimName: wp-pv-claim
+```
+
+**mysql-deployment.yaml:**
+
+```yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: wordpress-mysql
+  labels:
+    app: wordpress
+spec:
+  ports:
+    - port: 3306
+  selector:
+    app: wordpress
+    tier: mysql
+  clusterIP: None
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: mysql-pv-claim
+  labels:
+    app: wordpress
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 20Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: wordpress-mysql
+  labels:
+    app: wordpress
+spec:
+  selector:
+    matchLabels:
+      app: wordpress
+      tier: mysql
+  strategy:
+    type: Recreate
+  template:
+    metadata:
+      labels:
+        app: wordpress
+        tier: mysql
+    spec:
+      runtimeClassName: gvisor # ADD THIS LINE
+      containers:
+      - image: mysql:5.6
+        name: mysql
+        env:
+        - name: MYSQL_ROOT_PASSWORD
+          valueFrom:
+            secretKeyRef:
+              name: mysql-pass
+              key: password
+        ports:
+        - containerPort: 3306
+          name: mysql
+        volumeMounts:
+        - name: mysql-persistent-storage
+          mountPath: /var/lib/mysql
+      volumes:
+      - name: mysql-persistent-storage
+        persistentVolumeClaim:
+          claimName: mysql-pv-claim
+```
+
+Note that apart from `runtimeClassName: gvisor`, nothing else about the
+Deployment has changed.
+
+You are now ready to deploy the entire application. Just create a secret to
+store MySQL's password and *apply* both deployments:
+
+```bash
+kubectl create secret generic mysql-pass --from-literal=password=${YOUR_SECRET_PASSWORD_HERE?}
+kubectl apply -f mysql-deployment.yaml
+kubectl apply -f wordpress-deployment.yaml
+```
+
+Wait for the deployments to be ready and an external IP to be assigned to the
+WordPress service:
+
+```bash
+watch kubectl get service wordpress
+```
+
+Now, copy the service `EXTERNAL-IP` from above to your favorite browser to view
+and configure your new WordPress site.
+
+Congratulations! You have just deployed a WordPress site using GKE Sandbox.
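+
+As an optional sanity check (these commands are ours, not part of the original
+steps), you can confirm that the pods were scheduled with the gVisor runtime
+class, and reuse the `dmesg` trick from the Docker quick start to verify the
+sandbox from inside a pod:
+
+```bash
+# Both pods should report "gvisor" as their runtime class.
+kubectl get pods -o custom-columns='NAME:.metadata.name,RUNTIME:.spec.runtimeClassName'
+
+# Inside a sandboxed pod, dmesg prints the gVisor banner (assuming the image
+# ships a dmesg binary, as the Debian-based wordpress image does).
+kubectl exec deploy/wordpress -- dmesg | head -n 1
+```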
+
+### What's next
+
+To learn more about GKE Sandbox and how to run your deployment securely, take
+a look at the [documentation][gke-sandbox-docs].
+
+[gke-sandbox-docs]: https://cloud.google.com/kubernetes-engine/docs/how-to/sandbox-pods
+[gke-sandbox]: https://cloud.google.com/kubernetes-engine/sandbox/
+[project-selector]: https://console.cloud.google.com/projectselector/kubernetes
+[wordpress]: https://wordpress.com/
diff --git a/g3doc/user_guide/tutorials/node-pool-button.png b/g3doc/user_guide/tutorials/node-pool-button.png
new file mode 100644
index 000000000..bee0c11dc
Binary files /dev/null and b/g3doc/user_guide/tutorials/node-pool-button.png differ
--
cgit v1.2.3

From f126de6a28b9d64abcab37392cde812957eb9a82 Mon Sep 17 00:00:00 2001
From: Adin Scannell
Date: Tue, 28 Apr 2020 12:47:28 -0700
Subject: Add resource model.

---
 g3doc/BUILD                           |   8 +-
 g3doc/architecture_guide/resources.md | 139 ++++++++++++++++++++++++++++++++++
 2 files changed, 140 insertions(+), 7 deletions(-)
(limited to 'g3doc')

diff --git a/g3doc/BUILD b/g3doc/BUILD
index be3653c93..24177ad06 100644
--- a/g3doc/BUILD
+++ b/g3doc/BUILD
@@ -8,6 +8,7 @@ package(
 doc(
     name = "index",
     src = "README.md",
+    category = "Project",
     permalink = "/docs/",
     weight = "0",
 )
@@ -20,13 +21,6 @@ doc(
     weight = "10",
 )
 
-doc(
-    name = "basics",
-    src = "basics.md",
-    category = "Project",
-    permalink = "/docs/basics/",
-)
-
 doc(
     name = "community",
     src = "community.md",
diff --git a/g3doc/architecture_guide/resources.md b/g3doc/architecture_guide/resources.md
index 7e45b58a9..3ed5d6355 100644
--- a/g3doc/architecture_guide/resources.md
+++ b/g3doc/architecture_guide/resources.md
@@ -1 +1,140 @@
 # Resource Model
+
+The resource model for gVisor does not assume a fixed number of threads of
+execution (i.e. vCPUs) or amount of physical memory. Where possible, decisions
+about underlying physical resources are delegated to the host system, where
+optimizations can be made with global information. This delegation allows the
+sandbox to be highly dynamic in terms of resource usage: spanning a large
+number of cores and a large amount of memory when busy, and yielding those
+resources back to the host when not.
+
+Some of the details here may depend on the [platform](../platforms/), but in
+general this page describes the resource model used by gVisor. If you're not
+familiar with the terms here, you may want to start with the [Overview](../).
+
+## Processes
+
+Much like a Virtual Machine (VM), a gVisor sandbox appears as an opaque process
+on the system. Processes within the sandbox do not manifest as processes on the
+host system, and process-level interactions within the sandbox require entering
+the sandbox (e.g. via a [Docker exec][exec]).
+
+## Networking
+
+Similarly to processes, the sandbox attaches a network endpoint to the system,
+but runs its own network stack. All network resources, other than packets in
+flight, exist only inside the sandbox, bound by relevant resource limits.
+
+You can interact with network endpoints exposed by the sandbox, just as you
+would any other container, but network introspection similarly requires
+entering the sandbox.
+
+## Files
+
+Files may be backed by different implementations. For host-native files (where
+a file descriptor is available), the Gofer may return a file descriptor to the
+Sentry via [SCM_RIGHTS][scmrights][^1].
+
+These files may be read from and written to through standard system calls, and
+also mapped into the associated application's address space.
+This allows the same host memory to be shared across multiple sandboxes,
+although this mechanism does not preclude the use of side-channels (see the
+[security model](../security/)).
+
+Note that some file systems exist only within the context of the sandbox. For
+example, in many cases a `tmpfs` mount will be available at `/tmp` or
+`/dev/shm`, which allocates memory directly from the sandbox memory file (see
+below). Ultimately, these will be accounted against relevant limits in a
+similar way as the host native case.
+
+## Threads
+
+The Sentry models individual task threads with [goroutines][goroutine]. As a
+result, each task thread is a lightweight [green thread][greenthread], and may
+not correspond to an underlying host thread.
+
+However, application execution is modelled as a blocking system call within the
+Sentry. This means that additional host threads may be created, *depending on
+the number of active application threads*. In practice, a busy application will
+converge on the number of active threads, and the host will be able to make
+scheduling decisions about all application threads.
+
+## Time
+
+Time in the sandbox is provided by the Sentry, through its own [vDSO][vdso] and
+timekeeping implementation. This is divorced from the host time, and no state
+is shared with the host, although the time will be initialized with the host
+clock.
+
+The Sentry runs timers to note the passage of time, much like a kernel running
+on hardware (though the timers are software timers, in this case). These timers
+provide updates to the vDSO, the time returned through system calls, and the
+time recorded for usage or limit tracking (e.g. [RLIMIT_CPU][rlimit]).
+
+When all application threads are idle, the Sentry disables timers until an
+event occurs that wakes either the Sentry or an application thread, similar to
+a [tickless kernel][tickless]. This allows the Sentry to achieve near zero CPU
+usage for idle applications.
+
+## Memory
+
+The Sentry implements its own memory management, including demand-paging and a
+Sentry internal page cache for files that cannot be used natively. A single
+[memfd][memfd] backs all application memory.
+
+### Address spaces
+
+The creation of address spaces is platform-specific. For some platforms,
+additional "stub" processes may be created on the host in order to support
+additional address spaces. These stubs are subject to various limits applied at
+the sandbox level (e.g. PID limits).
+
+### Physical memory
+
+The host is able to manage physical memory using regular means (e.g. tracking
+working sets, reclaiming and swapping under pressure). The Sentry lazily
+populates host mappings for applications, and allows the host to demand-page
+those regions, which is critical for the functioning of those mechanisms.
+
+In order to avoid excessive overhead, the Sentry does not demand-page individual
+pages. Instead, it selects appropriate regions based on heuristics. There is a
+trade-off here: the Sentry is unable to trivially determine which pages are
+active and which are not. Even if pages were individually faulted, the host may
+select pages to be reclaimed or swapped without the Sentry's knowledge.
+
+Therefore, memory usage statistics within the sandbox (e.g. via `proc`) are
+approximations. The Sentry maintains an internal breakdown of memory usage, and
+can collect accurate information but only through a relatively expensive API
+call.
+In any case, it would likely be considered unwise to share precise information
+about how the host is managing memory with the sandbox.
+
+Finally, when an application marks a region of memory as no longer needed, for
+example via a call to [madvise][madvise], the Sentry *releases this memory back
+to the host*. There can be performance penalties for this, since it may be
+cheaper in many cases to retain the memory and use it to satisfy some other
+request. However, releasing it immediately to the host allows the host to more
+effectively multiplex resources and apply an efficient global policy.
+
+## Limits
+
+All Sentry threads and Sentry memory are subject to a container cgroup. However,
+application usage will not appear as anonymous memory usage, and will instead be
+accounted to the `memfd`. All anonymous memory will correspond to Sentry usage,
+and host memory charged to the container will work as standard.
+
+The cgroups can be monitored for standard signals: pressure indicators,
+threshold notifiers, etc., and can also be adjusted dynamically. Note that the
+Sentry itself may listen for pressure signals in its containing cgroup, in order
+to purge internal caches.
+
+[goroutine]: https://tour.golang.org/concurrency/1
+[greenthread]: https://en.wikipedia.org/wiki/Green_threads
+[scheduler]: https://morsmachine.dk/go-scheduler
+[vdso]: https://en.wikipedia.org/wiki/VDSO
+[rlimit]: http://man7.org/linux/man-pages/man2/getrlimit.2.html
+[tickless]: https://en.wikipedia.org/wiki/Tickless_kernel
+[memfd]: http://man7.org/linux/man-pages/man2/memfd_create.2.html
+[scmrights]: http://man7.org/linux/man-pages/man7/unix.7.html
+[madvise]: http://man7.org/linux/man-pages/man2/madvise.2.html
+[exec]: https://docs.docker.com/engine/reference/commandline/exec/
+
+[^1]: Unless host networking is enabled, the Sentry is not able to create or
+    open host file descriptors itself; it can only receive them in this way
+    from the Gofer.
--
cgit v1.2.3

From 3cb00c97e953df3d1970e1462d8dbcd47d496d61 Mon Sep 17 00:00:00 2001
From: Adin Scannell
Date: Tue, 28 Apr 2020 12:51:24 -0700
Subject: Add note about AArch64 support.

---
 g3doc/user_guide/FAQ.md | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)
(limited to 'g3doc')

diff --git a/g3doc/user_guide/FAQ.md b/g3doc/user_guide/FAQ.md
index a84ac3c48..7707217fb 100644
--- a/g3doc/user_guide/FAQ.md
+++ b/g3doc/user_guide/FAQ.md
@@ -7,7 +7,8 @@ Today, gVisor requires Linux.
 ### What CPU architectures are supported? {#supported-cpus}
 
 gVisor currently supports [x86_64/AMD64](https://en.wikipedia.org/wiki/X86-64)
-compatible processors.
+compatible processors. Preliminary support is also available for
+[ARM64](https://en.wikipedia.org/wiki/ARM_architecture#AArch64).
 
 ### Do I need to modify my Linux application to use gVisor? {#modify-app}
 
@@ -17,8 +18,11 @@ No. gVisor is capable of running unmodified Linux binaries.
 ### What binary formats does gVisor support? {#supported-binaries}
 
 gVisor supports Linux
 [ELF](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format) binaries.
+
 Binaries run in gVisor should be built for the
-[AMD64](https://en.wikipedia.org/wiki/X86-64) CPU architecture.
+[AMD64](https://en.wikipedia.org/wiki/X86-64) or
+[AArch64](https://en.wikipedia.org/wiki/ARM_architecture#AArch64) CPU
+architectures.
 
 ### Can I run Docker images using gVisor? {#docker-images}
--
cgit v1.2.3

From 5f3a256425f4fa99fd3e5363418c5978659cecf3 Mon Sep 17 00:00:00 2001
From: Adin Scannell
Date: Wed, 29 Apr 2020 18:54:48 -0700
Subject: Add support for kramdown TOC.
--- g3doc/architecture_guide/performance.md | 2 ++ g3doc/architecture_guide/resources.md | 2 ++ g3doc/architecture_guide/security.md | 2 ++ g3doc/user_guide/FAQ.md | 2 ++ g3doc/user_guide/checkpoint_restore.md | 2 ++ g3doc/user_guide/compatibility.md | 2 ++ g3doc/user_guide/debugging.md | 2 ++ g3doc/user_guide/filesystem.md | 2 ++ g3doc/user_guide/install.md | 6 ++++-- g3doc/user_guide/networking.md | 2 ++ g3doc/user_guide/platforms.md | 2 ++ website/defs.bzl | 9 ++++++++- 12 files changed, 32 insertions(+), 3 deletions(-) (limited to 'g3doc') diff --git a/g3doc/architecture_guide/performance.md b/g3doc/architecture_guide/performance.md index fd219be5e..2f83c0d20 100644 --- a/g3doc/architecture_guide/performance.md +++ b/g3doc/architecture_guide/performance.md @@ -1,5 +1,7 @@ # Performance Guide +[TOC] + gVisor is designed to provide a secure, virtualized environment while preserving key benefits of containerization, such as small fixed overheads and a dynamic resource footprint. For containerized infrastructure, this can provide a diff --git a/g3doc/architecture_guide/resources.md b/g3doc/architecture_guide/resources.md index 3ed5d6355..4580bf9f4 100644 --- a/g3doc/architecture_guide/resources.md +++ b/g3doc/architecture_guide/resources.md @@ -1,5 +1,7 @@ # Resource Model +[TOC] + The resource model for gVisor does not assume a fixed number of threads of execution (i.e. vCPUs) or amount of physical memory. Where possible, decisions about underlying physical resources are delegated to the host system, where diff --git a/g3doc/architecture_guide/security.md b/g3doc/architecture_guide/security.md index 59003f0a8..afafe5c05 100644 --- a/g3doc/architecture_guide/security.md +++ b/g3doc/architecture_guide/security.md @@ -1,5 +1,7 @@ # Security Model +[TOC] + gVisor was created in order to provide additional defense against the exploitation of kernel bugs by untrusted userspace code. In order to understand how gVisor achieves this goal, it is first necessary to understand the basic diff --git a/g3doc/user_guide/FAQ.md b/g3doc/user_guide/FAQ.md index 7707217fb..9eb9f4501 100644 --- a/g3doc/user_guide/FAQ.md +++ b/g3doc/user_guide/FAQ.md @@ -1,5 +1,7 @@ # FAQ +[TOC] + ### What operating systems are supported? {#supported-os} Today, gVisor requires Linux. diff --git a/g3doc/user_guide/checkpoint_restore.md b/g3doc/user_guide/checkpoint_restore.md index 1814a2799..b0aa308f3 100644 --- a/g3doc/user_guide/checkpoint_restore.md +++ b/g3doc/user_guide/checkpoint_restore.md @@ -1,5 +1,7 @@ # Checkpoint/Restore +[TOC] + gVisor has the ability to checkpoint a process, save its current state in a state file, and restore into a new container using the state file. diff --git a/g3doc/user_guide/compatibility.md b/g3doc/user_guide/compatibility.md index 5fe9fc1e8..30c787e75 100644 --- a/g3doc/user_guide/compatibility.md +++ b/g3doc/user_guide/compatibility.md @@ -1,5 +1,7 @@ # Applications +[TOC] + gVisor implements a large portion of the Linux surface and while we strive to make it broadly compatible, there are (and always will be) unimplemented features and bugs. The only real way to know if it will work is to try. 
If you diff --git a/g3doc/user_guide/debugging.md b/g3doc/user_guide/debugging.md index a7c3138d7..38e26db76 100644 --- a/g3doc/user_guide/debugging.md +++ b/g3doc/user_guide/debugging.md @@ -1,5 +1,7 @@ # Debugging +[TOC] + To enable debug and system call logging, add the `runtimeArgs` below to your [Docker](../quick_start/docker/) configuration (`/etc/docker/daemon.json`): diff --git a/g3doc/user_guide/filesystem.md b/g3doc/user_guide/filesystem.md index 13bc07ab1..50a1c0020 100644 --- a/g3doc/user_guide/filesystem.md +++ b/g3doc/user_guide/filesystem.md @@ -1,5 +1,7 @@ # Filesystem +[TOC] + gVisor accesses the filesystem through a file proxy, called the Gofer. The gofer runs as a separate process, that is isolated from the sandbox. Gofer instances communicate with their respective sentry using the 9P protocol. For a more detailed diff --git a/g3doc/user_guide/install.md b/g3doc/user_guide/install.md index 28422612e..a4cb926f5 100644 --- a/g3doc/user_guide/install.md +++ b/g3doc/user_guide/install.md @@ -1,7 +1,9 @@ # Installation --> Note: gVisor supports only x86\_64 and requires Linux 4.14.77+ --> ([older Linux](./networking.md#gso)). +[TOC] + +> Note: gVisor supports only x86\_64 and requires Linux 4.14.77+ +> ([older Linux](./networking.md#gso)). ## Versions diff --git a/g3doc/user_guide/networking.md b/g3doc/user_guide/networking.md index 26c76e8aa..348b66bfd 100644 --- a/g3doc/user_guide/networking.md +++ b/g3doc/user_guide/networking.md @@ -1,5 +1,7 @@ # Networking +[TOC] + gVisor implements its own network stack called [netstack][netstack]. All aspects of the network stack are handled inside the Sentry — including TCP connection state, control messages, and packet assembly — keeping it isolated from the host diff --git a/g3doc/user_guide/platforms.md b/g3doc/user_guide/platforms.md index fb48db34f..f13092016 100644 --- a/g3doc/user_guide/platforms.md +++ b/g3doc/user_guide/platforms.md @@ -1,5 +1,7 @@ # Platforms (KVM) +[TOC] + This document will help you set up your system to use a different gVisor platform. diff --git a/website/defs.bzl b/website/defs.bzl index fe711d5d1..64a9d43e3 100644 --- a/website/defs.bzl +++ b/website/defs.bzl @@ -130,7 +130,14 @@ layout: {layout}""" builder_content += [header.format(**args)] builder_content += ["---"] builder_content += ["EOF"] - builder_content += ["grep -v -E '^# ' %s >>$T/%s || true" % (f.path, f.short_path)] + + # To generate the final page, we need to strip out the title (which + # was pulled above to generate the annotation in the frontmatter), + # and substitute the [TOC] tag with the kramdown `{:toc}` + # table-of-contents markup. Note that the pipeline here is + # important, as the grep will return non-zero if the file is empty, + # but we ignore that within the pipeline. + builder_content += ["grep -v -E '^# ' %s | sed -e 's|^\\[TOC\\]$|- TOC\\n{:toc}|' >>$T/%s" % (f.path, f.short_path)] builder_content += ["declare -r filename=$(readlink -m %s)" % tarball.path] builder_content += ["(cd $T && tar -zcf \"${filename}\" .)\n"] -- cgit v1.2.3 From 1847165a8c034e82cb35a0dc23878921cab30b5d Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Fri, 15 May 2020 09:29:52 -0700 Subject: Minor text updates and jquery ordering.
PiperOrigin-RevId: 311744091 --- g3doc/architecture_guide/README.md | 3 +++ g3doc/user_guide/quick_start/docker.md | 2 +- g3doc/user_guide/quick_start/kubernetes.md | 2 +- g3doc/user_guide/quick_start/oci.md | 2 +- website/_includes/footer.html | 2 +- website/_layouts/docs.html | 2 +- website/index.md | 2 +- 7 files changed, 9 insertions(+), 6 deletions(-) (limited to 'g3doc') diff --git a/g3doc/architecture_guide/README.md b/g3doc/architecture_guide/README.md index 1364a5358..ab9ef7174 100644 --- a/g3doc/architecture_guide/README.md +++ b/g3doc/architecture_guide/README.md @@ -71,6 +71,9 @@ race detector. (The use of Go has its challenges too, and isn't free.) +Gofers mediate file system interactions, and are used to provide additional +isolation. For more details, see the [Platform Guide](./platforms.md). + [apparmor]: https://wiki.ubuntu.com/AppArmor [golang]: https://golang.org [kvm]: https://www.linux-kvm.org diff --git a/g3doc/user_guide/quick_start/docker.md b/g3doc/user_guide/quick_start/docker.md index 5228db4c0..fa8b9076b 100644 --- a/g3doc/user_guide/quick_start/docker.md +++ b/g3doc/user_guide/quick_start/docker.md @@ -1,4 +1,4 @@ -# Docker +# Docker Quick Start > Note: This guide requires Docker version 17.09.0 or greater. Refer to the > [Docker documentation][docker] for how to install it. diff --git a/g3doc/user_guide/quick_start/kubernetes.md b/g3doc/user_guide/quick_start/kubernetes.md index b1f67252e..f875d8002 100644 --- a/g3doc/user_guide/quick_start/kubernetes.md +++ b/g3doc/user_guide/quick_start/kubernetes.md @@ -1,4 +1,4 @@ -# Kubernetes +# Kubernetes Quick Start gVisor can be used to run Kubernetes pods and has several integration points with Kubernetes. diff --git a/g3doc/user_guide/quick_start/oci.md b/g3doc/user_guide/quick_start/oci.md index 57bcc4f63..877169145 100644 --- a/g3doc/user_guide/quick_start/oci.md +++ b/g3doc/user_guide/quick_start/oci.md @@ -1,4 +1,4 @@ -# OCI +# OCI Quick Start This guide will quickly get you started running your first gVisor sandbox container using the runtime directly with the default platform. diff --git a/website/_includes/footer.html b/website/_includes/footer.html index 5d9267f35..9cc8176f7 100644 --- a/website/_includes/footer.html +++ b/website/_includes/footer.html @@ -2,9 +2,9 @@ {% include footer-links.html %} + - {% if site.analytics %} diff --git a/website/_layouts/docs.html b/website/_layouts/docs.html index e11492915..33ea8e1de 100644 --- a/website/_layouts/docs.html +++ b/website/_layouts/docs.html @@ -47,7 +47,7 @@ categories:

{{ page.title }}

{% if page.editpath %}

- Edit this page + Edit this page Create issue

{% endif %} diff --git a/website/index.md b/website/index.md index 34d3ee23d..95d5d16f0 100644 --- a/website/index.md +++ b/website/index.md @@ -43,7 +43,7 @@ The pluggable platform architecture of gVisor allows it to run anywhere, enabling consistent security policies across multiple environments without having to rearchitect your infrastructure.

- Read More » + Get Started » -- cgit v1.2.3 From 420b791a3d6e0e6e2fc30c6f8be013bce7ca6549 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Fri, 15 May 2020 20:03:54 -0700 Subject: Minor formatting updates for gvisor.dev. * Aggregate architecture Overview in "What is gVisor?" as it makes more sense in one place. * Drop "user-space kernel" and use "application kernel". The term "user-space kernel" is confusing when some platform implementation do not run in user-space (instead running in guest ring zero). * Clear up the relationship between the Platform page in the user guide and the Platform page in the architecture guide, and ensure they are cross-linked. * Restore the call-to-action quick start link in the main page, and drop the GitHub link (which also appears in the top-right). * Improve image formatting by centering all doc and blog images, and move the image captions to the alt text. PiperOrigin-RevId: 311845158 --- README.md | 79 ++--- g3doc/BUILD | 4 + g3doc/Layers.png | Bin 0 -> 11044 bytes g3doc/Layers.svg | 1 + g3doc/Machine-Virtualization.png | Bin 0 -> 13205 bytes g3doc/Machine-Virtualization.svg | 1 + g3doc/README.md | 161 +++++++++- g3doc/Rule-Based-Execution.png | Bin 0 -> 6780 bytes g3doc/Rule-Based-Execution.svg | 1 + g3doc/Sentry-Gofer.png | Bin 0 -> 9064 bytes g3doc/Sentry-Gofer.svg | 1 + g3doc/architecture_guide/BUILD | 30 +- g3doc/architecture_guide/Layers.png | Bin 11044 -> 0 bytes g3doc/architecture_guide/Layers.svg | 1 - .../architecture_guide/Machine-Virtualization.png | Bin 13205 -> 0 bytes .../architecture_guide/Machine-Virtualization.svg | 1 - g3doc/architecture_guide/README.md | 83 ----- g3doc/architecture_guide/Rule-Based-Execution.png | Bin 6780 -> 0 bytes g3doc/architecture_guide/Rule-Based-Execution.svg | 1 - g3doc/architecture_guide/Sentry-Gofer.png | Bin 9064 -> 0 bytes g3doc/architecture_guide/Sentry-Gofer.svg | 1 - g3doc/architecture_guide/performance.md | 35 ++- g3doc/architecture_guide/platforms.md | 109 +++---- g3doc/architecture_guide/platforms.png | Bin 0 -> 21384 bytes g3doc/architecture_guide/platforms.svg | 334 +++++++++++++++++++++ g3doc/architecture_guide/resources.md | 27 +- g3doc/architecture_guide/resources.png | Bin 0 -> 16621 bytes g3doc/architecture_guide/resources.svg | 208 +++++++++++++ g3doc/architecture_guide/security.md | 28 +- g3doc/architecture_guide/security.png | Bin 0 -> 16932 bytes g3doc/architecture_guide/security.svg | 153 ++++++++++ g3doc/user_guide/filesystem.md | 4 +- g3doc/user_guide/platforms.md | 100 +++--- pkg/sentry/arch/syscalls_arm64.go | 2 +- pkg/sentry/kernel/pipe/pipe_util.go | 2 +- pkg/sentry/kernel/task_syscall.go | 4 +- pkg/sentry/socket/netstack/netstack.go | 6 +- pkg/tcpip/stack/stack.go | 4 +- pkg/tcpip/transport/tcp/endpoint.go | 2 +- pkg/tcpip/transport/tcp/tcp_test.go | 2 +- runsc/cmd/help.go | 12 +- website/BUILD | 1 - website/_layouts/docs.html | 2 + website/_sass/front.scss | 4 +- website/_sass/style.scss | 10 + website/blog/2019-11-18-security-basics.md | 28 +- website/blog/2020-04-02-networking-security.md | 8 +- website/index.md | 10 +- 48 files changed, 1054 insertions(+), 406 deletions(-) create mode 100644 g3doc/Layers.png create mode 100644 g3doc/Layers.svg create mode 100644 g3doc/Machine-Virtualization.png create mode 100644 g3doc/Machine-Virtualization.svg create mode 100644 g3doc/Rule-Based-Execution.png create mode 100644 g3doc/Rule-Based-Execution.svg create mode 100644 g3doc/Sentry-Gofer.png create mode 100644 g3doc/Sentry-Gofer.svg delete mode 100644 
g3doc/architecture_guide/Layers.png delete mode 100644 g3doc/architecture_guide/Layers.svg delete mode 100644 g3doc/architecture_guide/Machine-Virtualization.png delete mode 100644 g3doc/architecture_guide/Machine-Virtualization.svg delete mode 100644 g3doc/architecture_guide/README.md delete mode 100644 g3doc/architecture_guide/Rule-Based-Execution.png delete mode 100644 g3doc/architecture_guide/Rule-Based-Execution.svg delete mode 100644 g3doc/architecture_guide/Sentry-Gofer.png delete mode 100644 g3doc/architecture_guide/Sentry-Gofer.svg create mode 100644 g3doc/architecture_guide/platforms.png create mode 100644 g3doc/architecture_guide/platforms.svg create mode 100644 g3doc/architecture_guide/resources.png create mode 100644 g3doc/architecture_guide/resources.svg create mode 100644 g3doc/architecture_guide/security.png create mode 100644 g3doc/architecture_guide/security.svg (limited to 'g3doc') diff --git a/README.md b/README.md index de3e06f4e..442f5672a 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ ## What is gVisor? -**gVisor** is a user-space kernel, written in Go, that implements a substantial +**gVisor** is an application kernel, written in Go, that implements a substantial portion of the Linux system surface. It includes an [Open Container Initiative (OCI)][oci] runtime called `runsc` that provides an isolation boundary between the application and the host kernel. The `runsc` @@ -15,16 +15,17 @@ containers. ## Why does gVisor exist? Containers are not a [**sandbox**][sandbox]. While containers have -revolutionized how we develop, package, and deploy applications, running -untrusted or potentially malicious code without additional isolation is not a -good idea. The efficiency and performance gains from using a single, shared -kernel also mean that container escape is possible with a single vulnerability. - -gVisor is a user-space kernel for containers. It limits the host kernel surface -accessible to the application while still giving the application access to all -the features it expects. Unlike most kernels, gVisor does not assume or require -a fixed set of physical resources; instead, it leverages existing host kernel -functionality and runs as a normal user-space process. In other words, gVisor +revolutionized how we develop, package, and deploy applications, using them to +run untrusted or potentially malicious code without additional isolation is not +a good idea. While using a single, shared kernel allows for efficiency and +performance gains, it also means that container escape is possible with a single +vulnerability. + +gVisor is an application kernel for containers. It limits the host kernel +surface accessible to the application while still giving the application access +to all the features it expects. Unlike most kernels, gVisor does not assume or +require a fixed set of physical resources; instead, it leverages existing host +kernel functionality and runs as a normal process. In other words, gVisor +implements Linux by way of Linux. gVisor should not be confused with technologies and tools to harden containers @@ -39,33 +40,24 @@ be found at [gvisor.dev][gvisor-dev]. ## Installing from source -gVisor currently requires x86\_64 Linux to build, though support for other -architectures may become available in the future. +gVisor builds on x86_64 and ARM64. Other architectures may become available in +the future. + +For the purposes of these instructions, [bazel][bazel] and other build +dependencies are wrapped in a build container.
It is possible to use +[bazel][bazel] directly, or type `make help` for standard targets. ### Requirements Make sure the following dependencies are installed: * Linux 4.14.77+ ([older linux][old-linux]) -* [git][git] -* [Bazel][bazel] 1.2+ -* [Python][python] * [Docker version 17.09.0 or greater][docker] -* C++ toolchain supporting C++17 (GCC 7+, Clang 5+) -* Gold linker (e.g. `binutils-gold` package on Ubuntu) ### Building Build and install the `runsc` binary: -``` -bazel build runsc -sudo cp ./bazel-bin/runsc/linux_amd64_pure_stripped/runsc /usr/local/bin -``` - -If you don't want to install bazel on your system, you can build runsc in a -Docker container: - ``` make runsc sudo cp ./bazel-bin/runsc/linux_amd64_pure_stripped/runsc /usr/local/bin @@ -73,41 +65,19 @@ sudo cp ./bazel-bin/runsc/linux_amd64_pure_stripped/runsc /usr/local/bin ### Testing -The test suite can be run with Bazel: - -``` -bazel test //... -``` - -or in a Docker container: +To run standard test suites, you can use: ``` make unit-tests make tests ``` -### Using remote execution - -If you have a [Remote Build Execution][rbe] environment, you can use it to speed -up build and test cycles. - -You must authenticate with the project first: +To run specific tests, you can specify the target: ``` -gcloud auth application-default login --no-launch-browser +make test TARGET="//runsc:version_test" ``` -Then invoke bazel with the following flags: - -``` ---config=remote ---project_id=$PROJECT ---remote_instance_name=projects/$PROJECT/instances/default_instance -``` - -You can also add those flags to your local ~/.bazelrc to avoid needing to -specify them each time on the command line. - ### Using `go get` This project uses [bazel][bazel] to build and manage dependencies. A synthetic @@ -128,7 +98,7 @@ development on this branch is not supported. Development should occur on the ## Community & Governance -The governance model is documented in our [community][community] repository. +See [GOVERNANCE.md](GOVERNANCE.md) for project governance information. The [gvisor-users mailing list][gvisor-users-list] and [gvisor-dev mailing list][gvisor-dev-list] are good starting points for @@ -145,12 +115,9 @@ See [Contributing.md](CONTRIBUTING.md).
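The `go get` flow above can be exercised with a trivial consumer program. The sketch below is illustrative only: it assumes the synthetic `go` branch is fetchable at the module path `gvisor.dev/gvisor` and that it exports the `pkg/log` package (both assumptions for this example, not verified against the branch):

```golang
// Minimal consumer of the synthetic "go" branch described above.
// Assumes gvisor.dev/gvisor is fetchable with `go get` and that pkg/log
// is exported there; both are illustrative assumptions.
package main

import "gvisor.dev/gvisor/pkg/log"

func main() {
	// Emit a message through gVisor's logging package to prove the
	// dependency resolves and links.
	log.Infof("gVisor packages imported via the go branch")
}
```

If the import compiles, the branch is wired up correctly; substantive development still belongs on the main (bazel) branch, as the text notes.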
[bazel]: https://bazel.build [community]: https://gvisor.googlesource.com/community [docker]: https://www.docker.com -[git]: https://git-scm.com [gvisor-users-list]: https://groups.google.com/forum/#!forum/gvisor-users +[gvisor-dev]: https://gvisor.dev [gvisor-dev-list]: https://groups.google.com/forum/#!forum/gvisor-dev [oci]: https://www.opencontainers.org [old-linux]: https://gvisor.dev/docs/user_guide/networking/#gso -[python]: https://python.org -[rbe]: https://blog.bazel.build/2018/10/05/remote-build-execution.html [sandbox]: https://en.wikipedia.org/wiki/Sandbox_(computer_security) -[gvisor-dev]: https://gvisor.dev diff --git a/g3doc/BUILD b/g3doc/BUILD index 24177ad06..dbbf96204 100644 --- a/g3doc/BUILD +++ b/g3doc/BUILD @@ -9,6 +9,10 @@ doc( name = "index", src = "README.md", category = "Project", + data = glob([ + "*.png", + "*.svg", + ]), permalink = "/docs/", weight = "0", ) diff --git a/g3doc/Layers.png b/g3doc/Layers.png new file mode 100644 index 000000000..308c6c451 Binary files /dev/null and b/g3doc/Layers.png differ diff --git a/g3doc/Layers.svg b/g3doc/Layers.svg new file mode 100644 index 000000000..0a366f841 --- /dev/null +++ b/g3doc/Layers.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/g3doc/Machine-Virtualization.png b/g3doc/Machine-Virtualization.png new file mode 100644 index 000000000..1ba2ed6b2 Binary files /dev/null and b/g3doc/Machine-Virtualization.png differ diff --git a/g3doc/Machine-Virtualization.svg b/g3doc/Machine-Virtualization.svg new file mode 100644 index 000000000..5352da07b --- /dev/null +++ b/g3doc/Machine-Virtualization.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/g3doc/README.md b/g3doc/README.md index 7999f5d47..304a91493 100644 --- a/g3doc/README.md +++ b/g3doc/README.md @@ -1,6 +1,6 @@ # What is gVisor? -gVisor is a user-space kernel, written in Go, that implements a substantial +gVisor is an application kernel, written in Go, that implements a substantial portion of the [Linux system call interface][linux]. It provides an additional layer of isolation between running applications and the host operating system. @@ -9,19 +9,160 @@ that makes it easy to work with existing container tooling. The `runsc` runtime integrates with Docker and Kubernetes, making it simple to run sandboxed containers. -gVisor takes a distinct approach to container sandboxing and makes a different -set of technical trade-offs compared to existing sandbox technologies, thus -providing new tools and ideas for the container security landscape. - gVisor can be used with Docker, Kubernetes, or directly using `runsc`. Use the links below to see detailed instructions for each of them: -* [Docker](./user_guide/quick_start/docker/): The quickest and easiest way to - get started. -* [Kubernetes](./user_guide/quick_start/kubernetes/): Isolate Pods in your K8s - cluster with gVisor. -* [OCI Quick Start](./user_guide/quick_start/oci/): Expert mode. Customize +* [Docker](./user_guide/quick_start/docker.md): The quickest and easiest way + to get started. +* [Kubernetes](./user_guide/quick_start/kubernetes.md): Isolate Pods in your + K8s cluster with gVisor. +* [OCI Quick Start](./user_guide/quick_start/oci.md): Expert mode. Customize gVisor for your environment. +## What does gVisor do? + +gVisor provides a virtualized environment in order to sandbox containers. The +system interfaces normally implemented by the host kernel are moved into a +distinct, per-sandbox application kernel in order to minimize the risk of a +container escape exploit.
gVisor does not introduce large fixed overheads +however, and still retains a process-like model with respect to resource +utilization. + +## How is this different? + +Two other approaches are commonly taken to provide stronger isolation than +native containers. + +**Machine-level virtualization**, such as [KVM][kvm] and [Xen][xen], exposes +virtualized hardware to a guest kernel via a Virtual Machine Monitor (VMM). This +virtualized hardware is generally enlightened (paravirtualized) and additional +mechanisms can be used to improve the visibility between the guest and host +(e.g. balloon drivers, paravirtualized spinlocks). Running containers in +distinct virtual machines can provide great isolation, compatibility and +performance (though nested virtualization may bring challenges in this area), +but for containers it often requires additional proxies and agents, and may +require a larger resource footprint and slower start-up times. + +![Machine-level virtualization](Machine-Virtualization.png "Machine-level virtualization") + +**Rule-based execution**, such as [seccomp][seccomp], [SELinux][selinux] and +[AppArmor][apparmor], allows the specification of a fine-grained security policy +for an application or container. These schemes typically rely on hooks +implemented inside the host kernel to enforce the rules. If the surface can be +made small enough, then this is an excellent way to sandbox applications and +maintain native performance. However, in practice it can be extremely difficult +(if not impossible) to reliably define a policy for arbitrary, previously +unknown applications, making this approach challenging to apply universally. + +![Rule-based execution](Rule-Based-Execution.png "Rule-based execution") + +Rule-based execution is often combined with additional layers for +defense-in-depth. + +**gVisor** provides a third isolation mechanism, distinct from those above. + +gVisor intercepts application system calls and acts as the guest kernel, without +the need for translation through virtualized hardware. gVisor may be thought of +as either a merged guest kernel and VMM, or as seccomp on steroids. This +architecture allows it to provide a flexible resource footprint (i.e. one based +on threads and memory mappings, not fixed guest physical resources) while also +lowering the fixed costs of virtualization. However, this comes at the price of +reduced application compatibility and higher per-system call overhead. + +![gVisor](Layers.png "gVisor") + +On top of this, gVisor employs rule-based execution to provide defense-in-depth +(details below). + +gVisor's approach is similar to [User Mode Linux (UML)][uml], although UML +virtualizes hardware internally and thus provides a fixed resource footprint. + +Each of the above approaches may excel in distinct scenarios. For example, +machine-level virtualization will face challenges achieving high density, while +gVisor may provide poor performance for system call heavy workloads. + +## Why Go? + +gVisor is written in [Go][golang] in order to avoid security pitfalls that can +plague kernels. With Go, there are strong types, built-in bounds checks, no +uninitialized variables, no use-after-free, no stack overflow, and a built-in +race detector. However, the use of Go has its challenges, and the runtime often +introduces performance overhead. + +## What are the different components? + +A gVisor sandbox consists of multiple processes. These processes collectively +comprise an environment in which one or more containers can be run. 
+ +Each sandbox has its own isolated instance of: + +* The **Sentry**, which is a kernel that runs the containers and intercepts + and responds to system calls made by the application. + +Each container running in the sandbox has its own isolated instance of: + +* A **Gofer** which provides file system access to the containers. + +![gVisor architecture diagram](Sentry-Gofer.png "gVisor architecture diagram") + +## What is runsc? + +The entrypoint to running a sandboxed container is the `runsc` executable. +`runsc` implements the [Open Container Initiative (OCI)][oci] runtime +specification, which is used by Docker and Kubernetes. This means that OCI +compatible _filesystem bundles_ can be run by `runsc`. Filesystem bundles are +comprised of a `config.json` file containing container configuration, and a root +filesystem for the container. Please see the [OCI runtime spec][runtime-spec] +for more information on filesystem bundles. `runsc` implements multiple commands +that perform various functions such as starting, stopping, listing, and querying +the status of containers. + +### Sentry + + + +The Sentry is the largest component of gVisor. It can be thought of as an +application kernel. The Sentry implements all the kernel functionality needed by +the application, including: system calls, signal delivery, memory management and +page faulting logic, the threading model, and more. + +When the application makes a system call, the +[Platform](./architecture_guide/platforms.md) redirects the call to the Sentry, +which will do the necessary work to service it. It is important to note that the +Sentry does not pass system calls through to the host kernel. As a userspace +application, the Sentry will make some host system calls to support its +operation, but it does not allow the application to directly control the system +calls it makes. For example, the Sentry is not able to open files directly; file +system operations that extend beyond the sandbox (not internal `/proc` files, +pipes, etc) are sent to the Gofer, described below. + +### Gofer + + + +The Gofer is a standard host process which is started with each container and +communicates with the Sentry via the [9P protocol][9p] over a socket or shared +memory channel. The Sentry process is started in a restricted seccomp container +without access to file system resources. The Gofer mediates all access to +these resources, providing an additional level of isolation. + +### Application + +The application is a normal Linux binary provided to gVisor in an OCI runtime +bundle. gVisor aims to provide an environment equivalent to Linux v4.4, so +applications should be able to run unmodified. However, gVisor does not +presently implement every system call, `/proc` file, or `/sys` file so some +incompatibilities may occur. See [Compatibility](./user_guide/compatibility.md) +for more information.
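The Gofer-to-Sentry handoff described above (the Sentry cannot open host files itself; it receives them, as the resource-model footnote earlier in this series notes) rides on standard Unix `SCM_RIGHTS` ancillary data. Below is a minimal, self-contained sketch of that mechanism in Go. It is illustrative only, not gVisor's actual Gofer or Sentry code; the socket pair and `/etc/hostname` path are assumptions made for the example.

```golang
// Sketch of SCM_RIGHTS file descriptor donation, the mechanism by which a
// Gofer-like process hands an open host file to a Sentry-like process.
package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

func main() {
	// A connected socket pair stands in for the Gofer<->Sentry channel.
	fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM, 0)
	if err != nil {
		panic(err)
	}

	// "Gofer" side: open a host file (any readable file will do) and pack
	// its descriptor into ancillary data.
	f, err := os.Open("/etc/hostname")
	if err != nil {
		panic(err)
	}
	defer f.Close()
	rights := unix.UnixRights(int(f.Fd()))
	if err := unix.Sendmsg(fds[0], []byte("donation"), rights, nil, 0); err != nil {
		panic(err)
	}

	// "Sentry" side: receive the message and unpack the donated descriptor.
	buf := make([]byte, 16)
	oob := make([]byte, unix.CmsgSpace(4)) // space for one 32-bit fd
	_, oobn, _, _, err := unix.Recvmsg(fds[1], buf, oob, 0)
	if err != nil {
		panic(err)
	}
	cmsgs, err := unix.ParseSocketControlMessage(oob[:oobn])
	if err != nil {
		panic(err)
	}
	donated, err := unix.ParseUnixRights(&cmsgs[0])
	if err != nil {
		panic(err)
	}
	fmt.Println("received donated fd:", donated[0])
}
```

The design consequence is that only the Gofer holds the ambient right to open host files; a compromised Sentry is limited to the descriptors it has explicitly been handed.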
+ +[9p]: https://en.wikipedia.org/wiki/9P_(protocol) +[apparmor]: https://wiki.ubuntu.com/AppArmor +[golang]: https://golang.org +[kvm]: https://www.linux-kvm.org [linux]: https://en.wikipedia.org/wiki/Linux_kernel_interfaces [oci]: https://www.opencontainers.org +[runtime-spec]: https://github.com/opencontainers/runtime-spec +[seccomp]: https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt +[selinux]: https://selinuxproject.org +[uml]: http://user-mode-linux.sourceforge.net/ +[xen]: https://www.xenproject.org diff --git a/g3doc/Rule-Based-Execution.png b/g3doc/Rule-Based-Execution.png new file mode 100644 index 000000000..b42654a90 Binary files /dev/null and b/g3doc/Rule-Based-Execution.png differ diff --git a/g3doc/Rule-Based-Execution.svg b/g3doc/Rule-Based-Execution.svg new file mode 100644 index 000000000..bd6717043 --- /dev/null +++ b/g3doc/Rule-Based-Execution.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/g3doc/Sentry-Gofer.png b/g3doc/Sentry-Gofer.png new file mode 100644 index 000000000..ca2c27ef7 Binary files /dev/null and b/g3doc/Sentry-Gofer.png differ diff --git a/g3doc/Sentry-Gofer.svg b/g3doc/Sentry-Gofer.svg new file mode 100644 index 000000000..5c10750d2 --- /dev/null +++ b/g3doc/Sentry-Gofer.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/g3doc/architecture_guide/BUILD b/g3doc/architecture_guide/BUILD index 72038305b..404f627a4 100644 --- a/g3doc/architecture_guide/BUILD +++ b/g3doc/architecture_guide/BUILD @@ -5,31 +5,13 @@ package( licenses = ["notice"], ) -doc( - name = "index", - src = "README.md", - category = "Architecture Guide", - data = [ - "Layers.png", - "Layers.svg", - "Machine-Virtualization.png", - "Machine-Virtualization.svg", - "Rule-Based-Execution.png", - "Rule-Based-Execution.svg", - "Sentry-Gofer.png", - "Sentry-Gofer.svg", - ], - permalink = "/docs/architecture_guide/", - weight = "0", -) - doc( name = "platforms", src = "platforms.md", category = "Architecture Guide", data = [ - "Sentry-Gofer.png", - "Sentry-Gofer.svg", + "platforms.png", + "platforms.svg", ], permalink = "/docs/architecture_guide/platforms/", weight = "40", @@ -39,6 +21,10 @@ doc( name = "resources", src = "resources.md", category = "Architecture Guide", + data = [ + "resources.png", + "resources.svg", + ], permalink = "/docs/architecture_guide/resources/", weight = "30", ) @@ -48,8 +34,8 @@ doc( src = "security.md", category = "Architecture Guide", data = [ - "Layers.png", - "Layers.svg", + "security.png", + "security.svg", ], permalink = "/docs/architecture_guide/security/", weight = "10", diff --git a/g3doc/architecture_guide/Layers.png b/g3doc/architecture_guide/Layers.png deleted file mode 100644 index 308c6c451..000000000 Binary files a/g3doc/architecture_guide/Layers.png and /dev/null differ diff --git a/g3doc/architecture_guide/Layers.svg b/g3doc/architecture_guide/Layers.svg deleted file mode 100644 index 0a366f841..000000000 --- a/g3doc/architecture_guide/Layers.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/g3doc/architecture_guide/Machine-Virtualization.png b/g3doc/architecture_guide/Machine-Virtualization.png deleted file mode 100644 index 1ba2ed6b2..000000000 Binary files a/g3doc/architecture_guide/Machine-Virtualization.png and /dev/null differ diff --git a/g3doc/architecture_guide/Machine-Virtualization.svg b/g3doc/architecture_guide/Machine-Virtualization.svg deleted file mode 100644 index 5352da07b..000000000 --- a/g3doc/architecture_guide/Machine-Virtualization.svg +++ /dev/null @@ -1 +0,0 @@ - \ 
No newline at end of file diff --git a/g3doc/architecture_guide/README.md b/g3doc/architecture_guide/README.md deleted file mode 100644 index ab9ef7174..000000000 --- a/g3doc/architecture_guide/README.md +++ /dev/null @@ -1,83 +0,0 @@ -# Overview - -gVisor provides a virtualized environment in order to sandbox untrusted -containers. The system interfaces normally implemented by the host kernel are -moved into a distinct, per-sandbox user space kernel in order to minimize the -risk of an exploit. gVisor does not introduce large fixed overheads however, and -still retains a process-like model with respect to resource utilization. - -## How is this different? - -Two other approaches are commonly taken to provide stronger isolation than -native containers. - -**Machine-level virtualization**, such as [KVM][kvm] and [Xen][xen], exposes -virtualized hardware to a guest kernel via a Virtual Machine Monitor (VMM). This -virtualized hardware is generally enlightened (paravirtualized) and additional -mechanisms can be used to improve the visibility between the guest and host -(e.g. balloon drivers, paravirtualized spinlocks). Running containers in -distinct virtual machines can provide great isolation, compatibility and -performance (though nested virtualization may bring challenges in this area), -but for containers it often requires additional proxies and agents, and may -require a larger resource footprint and slower start-up times. - -![Machine-level virtualization](Machine-Virtualization.png "Machine-level virtualization") - -**Rule-based execution**, such as [seccomp][seccomp], [SELinux][selinux] and -[AppArmor][apparmor], allows the specification of a fine-grained security policy -for an application or container. These schemes typically rely on hooks -implemented inside the host kernel to enforce the rules. If the surface can be -made small enough (i.e. a sufficiently complete policy defined), then this is an -excellent way to sandbox applications and maintain native performance. However, -in practice it can be extremely difficult (if not impossible) to reliably define -a policy for arbitrary, previously unknown applications, making this approach -challenging to apply universally. - -![Rule-based execution](Rule-Based-Execution.png "Rule-based execution") - -Rule-based execution is often combined with additional layers for -defense-in-depth. - -**gVisor** provides a third isolation mechanism, distinct from those above. - -gVisor intercepts application system calls and acts as the guest kernel, without -the need for translation through virtualized hardware. gVisor may be thought of -as either a merged guest kernel and VMM, or as seccomp on steroids. This -architecture allows it to provide a flexible resource footprint (i.e. one based -on threads and memory mappings, not fixed guest physical resources) while also -lowering the fixed costs of virtualization. However, this comes at the price of -reduced application compatibility and higher per-system call overhead. - -![gVisor](Layers.png "gVisor") - -On top of this, gVisor employs rule-based execution to provide defense-in-depth -(details below). - -gVisor's approach is similar to [User Mode Linux (UML)][uml], although UML -virtualizes hardware internally and thus provides a fixed resource footprint. - -Each of the above approaches may excel in distinct scenarios. For example, -machine-level virtualization will face challenges achieving high density, while -gVisor may provide poor performance for system call heavy workloads. - -### Why Go? 
- -gVisor is written in [Go][golang] in order to avoid security pitfalls that can -plague kernels. With Go, there are strong types, built-in bounds checks, no -uninitialized variables, no use-after-free, no stack overflow, and a built-in -race detector. (The use of Go has its challenges too, and isn't free.) - -### What about Gofers? - - - -Gofers mediate file system interactions, and are used to provide additional -isolation. For more details, see the [Platform Guide](./platforms.md). - -[apparmor]: https://wiki.ubuntu.com/AppArmor -[golang]: https://golang.org -[kvm]: https://www.linux-kvm.org -[seccomp]: https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt -[selinux]: https://selinuxproject.org -[uml]: http://user-mode-linux.sourceforge.net/ -[xen]: https://www.xenproject.org diff --git a/g3doc/architecture_guide/Rule-Based-Execution.png b/g3doc/architecture_guide/Rule-Based-Execution.png deleted file mode 100644 index b42654a90..000000000 Binary files a/g3doc/architecture_guide/Rule-Based-Execution.png and /dev/null differ diff --git a/g3doc/architecture_guide/Rule-Based-Execution.svg b/g3doc/architecture_guide/Rule-Based-Execution.svg deleted file mode 100644 index bd6717043..000000000 --- a/g3doc/architecture_guide/Rule-Based-Execution.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/g3doc/architecture_guide/Sentry-Gofer.png b/g3doc/architecture_guide/Sentry-Gofer.png deleted file mode 100644 index ca2c27ef7..000000000 Binary files a/g3doc/architecture_guide/Sentry-Gofer.png and /dev/null differ diff --git a/g3doc/architecture_guide/Sentry-Gofer.svg b/g3doc/architecture_guide/Sentry-Gofer.svg deleted file mode 100644 index 5c10750d2..000000000 --- a/g3doc/architecture_guide/Sentry-Gofer.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/g3doc/architecture_guide/performance.md b/g3doc/architecture_guide/performance.md index 3862d78ee..39dbb0045 100644 --- a/g3doc/architecture_guide/performance.md +++ b/g3doc/architecture_guide/performance.md @@ -13,12 +13,13 @@ forms: additional cycles and memory usage, which may manifest as increased latency, reduced throughput or density, or not at all. In general, these costs come from two different sources. -First, the existence of the [Sentry](../) means that additional memory will be -required, and application system calls must traverse additional layers of -software. The design emphasizes [security](../security/) and therefore we chose -to use a language for the Sentry that provides benefits in this domain but may -not yet offer the raw performance of other choices. Costs imposed by these -design choices are **structural costs**. +First, the existence of the [Sentry](../README.md#sentry) means that additional +memory will be required, and application system calls must traverse additional +layers of software. The design emphasizes +[security](/docs/architecture_guide/security/) and therefore we chose to use a +language for the Sentry that provides benefits in this domain but may not yet +offer the raw performance of other choices. Costs imposed by these design +choices are **structural costs**. Second, as gVisor is an independent implementation of the system call surface, many of the subsystems or specific calls are not as optimized as more mature @@ -50,7 +51,7 @@ Virtual Machines (VMs) with the following specifications: Through this document, `runsc` is used to indicate the runtime provided by gVisor. 
When relevant, we use the name `runsc-platform` to describe a specific -[platform choice](../platforms/). +[platform choice](/docs/architecture_guide/platforms/). **Except where specified, all tests below are conducted with the `ptrace` platform. The `ptrace` platform works everywhere and does not require hardware @@ -131,11 +132,11 @@ full start-up and run time for the workload, which trains a model. ## System calls Some **structural costs** of gVisor are heavily influenced by the -[platform choice](../platforms/), which implements system call interception. -Today, gVisor supports a variety of platforms. These platforms present distinct -performance, compatibility and security trade-offs. For example, the KVM -platform has low overhead system call interception but runs poorly with nested -virtualization. +[platform choice](/docs/architecture_guide/platforms/), which implements system +call interception. Today, gVisor supports a variety of platforms. These +platforms present distinct performance, compatibility and security trade-offs. +For example, the KVM platform has low overhead system call interception but runs +poorly with nested virtualization. {% include graph.html id="syscall" url="/performance/syscall.csv" title="perf.py syscall --runtime=runc --runtime=runsc-ptrace --runtime=runsc-kvm" y_min="100" @@ -163,7 +164,8 @@ overhead. Some of these costs above are **structural costs**, and `redis` is likely to remain a challenging performance scenario. However, optimizing the -[platform](../platforms/) will also have a dramatic impact. +[platform](/docs/architecture_guide/platforms/) will also have a dramatic +impact. ## Start-up time @@ -184,7 +186,7 @@ similarly loads a number of modules and binds an HTTP server. > Note: most of the time overhead above is associated Docker itself. This is > evident with the empty `runc` benchmark. To avoid these costs with `runsc`, > you may also consider using `runsc do` mode or invoking the -> [OCI runtime](../../user_guide/quick_start/oci/) directly. +> [OCI runtime](../user_guide/quick_start/oci.md) directly. ## Network @@ -222,8 +224,9 @@ In terms of raw disk I/O, gVisor does not introduce significant fundamental overhead. For general file operations, gVisor introduces a small fixed overhead for data that transitions across the sandbox boundary. This manifests as **structural costs** in some cases, since these operations must be routed -through the [Gofer](../) as a result of our [security model](../security/), but -in most cases are dominated by **implementation costs**, due to an internal +through the [Gofer](../README.md#gofer) as a result of our +[Security Model](/docs/architecture_guide/security/), but in most cases are +dominated by **implementation costs**, due to an internal [Virtual File System][vfs] (VFS) implementation that needs improvement. {% include graph.html id="fio-bw" url="/performance/fio.csv" title="perf.py fio diff --git a/g3doc/architecture_guide/platforms.md b/g3doc/architecture_guide/platforms.md index 6e63da8ce..d112c9a28 100644 --- a/g3doc/architecture_guide/platforms.md +++ b/g3doc/architecture_guide/platforms.md @@ -1,86 +1,61 @@ # Platform Guide -A gVisor sandbox consists of multiple processes when running. These processes -collectively comprise a shared environment in which one or more containers can -be run. +[TOC] -Each sandbox has its own isolated instance of: - -* The **Sentry**, A user-space kernel that runs the container and intercepts - and responds to system calls made by the application. 
- -Each container running in the sandbox has its own isolated instance of: - -* A **Gofer** which provides file system access to the container. - -![gVisor architecture diagram](Sentry-Gofer.png "gVisor architecture diagram") - -## runsc - -The entrypoint to running a sandboxed container is the `runsc` executable. -`runsc` implements the [Open Container Initiative (OCI)][oci] runtime -specification. This means that OCI compatible _filesystem bundles_ can be run by -`runsc`. Filesystem bundles are comprised of a `config.json` file containing -container configuration, and a root filesystem for the container. Please see the -[OCI runtime spec][runtime-spec] for more information on filesystem bundles. -`runsc` implements multiple commands that perform various functions such as -starting, stopping, listing, and querying the status of containers. +gVisor requires a platform to implement interception of syscalls, basic context +switching, and memory mapping functionality. Internally, gVisor uses an +abstraction sensibly called [Platform][platform]. A simplified version of this +interface looks like: -## Sentry +```golang +type Platform interface { + NewAddressSpace() (AddressSpace, error) + NewContext() Context +} -The Sentry is the largest component of gVisor. It can be thought of as a -userspace OS kernel. The Sentry implements all the kernel functionality needed -by the untrusted application. It implements all of the supported system calls, -signal delivery, memory management and page faulting logic, the threading model, -and more. +type Context interface { + Switch(as AddressSpace, ac arch.Context) (..., error) +} -When the untrusted application makes a system call, the currently used platform -redirects the call to the Sentry, which will do the necessary work to service -it. It is important to note that the Sentry will not simply pass through system -calls to the host kernel. As a userspace application, the Sentry will make some -host system calls to support its operation, but it will not allow the -application to directly control the system calls it makes. +type AddressSpace interface { + MapFile(addr usermem.Addr, f File, fr FileRange, at usermem.AccessType, ...) error + Unmap(addr usermem.Addr, length uint64) +} +``` -The Sentry aims to present an equivalent environment to (upstream) Linux v4.4. +There are a number of different ways to implement this interface that come with +various trade-offs, generally around performance and hardware requirements. -File system operations that extend beyond the sandbox (not internal /proc files, -pipes, etc) are sent to the Gofer, described below. +## Implementations -## Platforms +The choice of platform depends on the context in which `runsc` is executing. In +general, virtualized platforms may be limited to platforms that do not require +hardware virtualization support (since the hardware is already in use): -gVisor requires a platform to implement interception of syscalls, basic context -switching, and memory mapping functionality. +![Platforms](platforms.png "Platform examples.") ### ptrace -The ptrace platform uses `PTRACE_SYSEMU` to execute user code without allowing -it to execute host system calls. This platform can run anywhere that ptrace -works (even VMs without nested virtualization). - -### KVM (experimental) +The ptrace platform uses [PTRACE_SYSEMU][ptrace] to execute user code without +allowing it to execute host system calls. This platform can run anywhere that +`ptrace` works (even VMs without nested virtualization), which is ubiquitous.
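The interception model is easy to see with a toy tracer. The sketch below uses `PTRACE_SYSCALL` stops to merely *observe* a child's system calls; the real ptrace platform uses `PTRACE_SYSEMU`, which stops the tracee in place of executing the call so the Sentry can service it. This is a simplified stand-in, assumed to run on Linux/amd64 (the `Orig_rax` field is architecture specific), with `/bin/true` as an arbitrary traced binary:

```golang
// Toy syscall-stop tracer, illustrating the interception idea behind the
// ptrace platform. Observational only; not gVisor code.
package main

import (
	"fmt"
	"os/exec"
	"runtime"
	"syscall"
)

func main() {
	// ptrace requests must be issued from the attaching thread, so pin it.
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	cmd := exec.Command("/bin/true") // arbitrary victim binary (assumption)
	cmd.SysProcAttr = &syscall.SysProcAttr{Ptrace: true}
	if err := cmd.Start(); err != nil {
		panic(err)
	}
	pid := cmd.Process.Pid

	// First stop: SIGTRAP delivered at the end of execve.
	var ws syscall.WaitStatus
	if _, err := syscall.Wait4(pid, &ws, 0, nil); err != nil {
		panic(err)
	}

	var regs syscall.PtraceRegs
	for !ws.Exited() {
		// PTRACE_SYSCALL stops at each syscall entry and exit. The real
		// platform uses PTRACE_SYSEMU, which suppresses the call itself so
		// the Sentry can service it and reflect results into the registers.
		if err := syscall.PtraceSyscall(pid, 0); err != nil {
			panic(err)
		}
		if _, err := syscall.Wait4(pid, &ws, 0, nil); err != nil {
			panic(err)
		}
		if ws.Stopped() {
			if err := syscall.PtraceGetRegs(pid, &regs); err == nil {
				fmt.Printf("stop: syscall %d\n", regs.Orig_rax) // amd64 field
			}
		}
	}
}
```

Each traced call costs two full stop/resume round trips here, which is a fair intuition for why the text below describes the ptrace platform's context switch overhead as high.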
-The KVM platform allows the Sentry to act as both guest OS and VMM, switching -back and forth between the two worlds seamlessly. The KVM platform can run on -bare-metal or in a VM with nested virtualization enabled. While there is no -virtualized hardware layer -- the sandbox retains a process model -- gVisor -leverages virtualization extensions available on modern processors in order to -improve isolation and performance of address space switches. +Unfortunately, the ptrace platform has high context switch overhead, so system +call-heavy applications may pay a [performance penalty](./performance.md). -## Gofer +### KVM -The Gofer is a normal host Linux process. The Gofer is started with each sandbox -and connected to the Sentry. The Sentry process is started in a restricted -seccomp container without access to file system resources. The Gofer provides -the Sentry access to file system resources via the 9P protocol and provides an -additional level of isolation. +The KVM platform uses the kernel's [KVM][kvm] functionality to allow the Sentry +to act as both guest OS and VMM. The KVM platform can run on bare-metal or in a +VM with nested virtualization enabled. While there is no virtualized hardware +layer -- the sandbox retains a process model -- gVisor leverages virtualization +extensions available on modern processors in order to improve isolation and +performance of address space switches. -## Application +## Changing Platforms -The application (aka the untrusted application) is a normal Linux binary -provided to gVisor in an OCI runtime bundle. gVisor aims to provide an -environment equivalent to Linux v4.4, so applications should be able to run -unmodified. However, gVisor does not presently implement every system call, -/proc file, or /sys file so some incompatibilities may occur. +See [Changing Platforms](../user_guide/platforms.md). -[oci]: https://www.opencontainers.org -[runtime-spec]: https://github.com/opencontainers/runtime-spec +[kvm]: https://www.kernel.org/doc/Documentation/virtual/kvm/api.txt +[platform]: https://cs.opensource.google/gvisor/gvisor/+/release-20190304.1:pkg/sentry/platform/platform.go;l=33 +[ptrace]: http://man7.org/linux/man-pages/man2/ptrace.2.html diff --git a/g3doc/architecture_guide/platforms.png b/g3doc/architecture_guide/platforms.png new file mode 100644 index 000000000..005d56feb Binary files /dev/null and b/g3doc/architecture_guide/platforms.png differ diff --git a/g3doc/architecture_guide/platforms.svg b/g3doc/architecture_guide/platforms.svg new file mode 100644 index 000000000..b0bac9ba7 --- /dev/null +++ b/g3doc/architecture_guide/platforms.svg @@ -0,0 +1,334 @@ + [SVG source unrenderable in this view; the figure's legible labels show four platform examples: "gVisor"/"workload" on "host", "gVisor"/"workload" on "KVM", "gVisor"/"workload" on "ptrace", and "gVisor"/"workload" on "ptrace" inside a "VM"/"guest".] diff --git a/g3doc/architecture_guide/resources.md b/g3doc/architecture_guide/resources.md index 894f995ae..1dec37bd1 100644 --- a/g3doc/architecture_guide/resources.md +++ b/g3doc/architecture_guide/resources.md @@ -10,9 +10,10 @@ sandbox to be highly dynamic in terms of resource usage: spanning a large number of cores and large amount of memory when busy, and yielding those resources back to the host when not. -Some of the details here may depend on the [platform](../platforms/), but in -general this page describes the resource model used by gVisor. If you're not -familiar with the terms here, uou may want to start with the [Overview](../).
+In other words, the shape of the sandbox should closely track the shape of the +sandboxed process: + +![Resource model](resources.png "Workloads of different shapes.") ## Processes @@ -23,9 +24,9 @@ the sandbox (e.g. via a [Docker exec][exec]). ## Networking -Similarly to processes, the sandbox attaches a network endpoint to the system, -but runs it's own network stack. All network resources, other than packets in -flight, exist only inside the sandbox, bound by relevant resource limits. +The sandbox attaches a network endpoint to the system, but runs its own network +stack. All network resources, other than packets in flight on the host, exist +only inside the sandbox, bound by relevant resource limits. You can interact with network endpoints exposed by the sandbox, just as you would any other container, but network introspection similarly requires entering @@ -33,15 +34,14 @@ the sandbox. ## Files -Files may be backed by different implementations. For host-native files (where a -file descriptor is available), the Gofer may return a file descriptor to the -Sentry via [SCM_RIGHTS][scmrights][^1]. +Files in the sandbox may be backed by different implementations. For host-native +files (where a file descriptor is available), the Gofer may return a file +descriptor to the Sentry via [SCM_RIGHTS][scmrights][^1]. These files may be read from and written to through standard system calls, and also mapped into the associated application's address space. This allows the same host memory to be shared across multiple sandboxes, although this mechanism -does not preclude the use of side-channels (see the -[security model](../security/)). +does not preclude the use of side-channels (see [Security Model](./security.md)). Note that some file systems exist only within the context of the sandbox. For example, in many cases a `tmpfs` mount will be available at `/tmp` or @@ -64,8 +64,9 @@ scheduling decisions about all application threads. ## Time Time in the sandbox is provided by the Sentry, through its own [vDSO][vdso] and -timekeeping implementation. This is divorced from the host time, and no state is -shared with the host, although the time will be initialized with the host clock. +time-keeping implementation. This is distinct from the host time, and no state +is shared with the host, although the time will be initialized with the host +clock. The Sentry runs timers to note the passage of time, much like a kernel running on hardware (though the timers are software timers, in this case). These timers diff --git a/g3doc/architecture_guide/resources.png b/g3doc/architecture_guide/resources.png new file mode 100644 index 000000000..f715008ec Binary files /dev/null and b/g3doc/architecture_guide/resources.png differ diff --git a/g3doc/architecture_guide/resources.svg b/g3doc/architecture_guide/resources.svg new file mode 100644 index 000000000..fd7805d90 --- /dev/null +++ b/g3doc/architecture_guide/resources.svg @@ -0,0 +1,208 @@ + [SVG source unrenderable in this view; legible labels: "gVisor" (x3), "workload" (x3), "host".] diff --git a/g3doc/architecture_guide/security.md b/g3doc/architecture_guide/security.md index f78586291..b99b86332 100644 --- a/g3doc/architecture_guide/security.md +++ b/g3doc/architecture_guide/security.md @@ -86,15 +86,17 @@ a substitute for a secure architecture*. ## Goals: Limiting Exposure -gVisor’s primary design goal is to minimize the System API attack vector while -still providing a process model.
There are two primary security principles that -inform this design. First, the application’s direct interactions with the host -System API are intercepted by the Sentry, which implements the System API -instead. Second, the System API accessible to the Sentry itself is minimized to -a safer, restricted set. The first principle minimizes the possibility of direct -exploitation of the host System API by applications, and the second principle -minimizes indirect exploitability, which is the exploitation by an exploited or -buggy Sentry (e.g. chaining an exploit). +![Threat model](security.png "Threat model.") + +gVisor’s primary design goal is to minimize the System API attack vector through +multiple layers of defense, while still providing a process model. There are two +primary security principles that inform this design. First, the application’s +direct interactions with the host System API are intercepted by the Sentry, +which implements the System API instead. Second, the System API accessible to +the Sentry itself is minimized to a safer, restricted set. The first principle +minimizes the possibility of direct exploitation of the host System API by +applications, and the second principle minimizes indirect exploitability, which +is the exploitation by an exploited or buggy Sentry (e.g. chaining an exploit). The first principle is similar to the security basis for a Virtual Machine (VM). With a VM, an application’s interactions with the host are replaced by @@ -210,9 +212,9 @@ crashes are recorded and triaged to similarly identify material issues. ### Is this more or less secure than a Virtual Machine? The security of a VM depends to a large extent on what is exposed from the host -kernel and user space support code. For example, device emulation code in the +kernel and userspace support code. For example, device emulation code in the host kernel (e.g. APIC) or optimizations (e.g. vhost) can be more complex than a -simple system call, and exploits carry the same risks. Similarly, the user space +simple system call, and exploits carry the same risks. Similarly, the userspace support code is frequently unsandboxed, and exploits, while rare, may allow unfettered access to the system. @@ -245,8 +247,8 @@ In gVisor, the platforms that use ptrace operate differently. The stubs that are traced are never allowed to continue execution into the host kernel and complete a call directly. Instead, all system calls are interpreted and handled by the Sentry itself, who reflects resulting register state back into the tracee before -continuing execution in user space. This is very similar to the mechanism used -by User-Mode Linux (UML). +continuing execution in userspace. This is very similar to the mechanism used by +User-Mode Linux (UML). 
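The second principle above, minimizing the System API accessible to the Sentry itself, is enforced in practice with seccomp-bpf. The standalone sketch below installs a deliberately tiny filter (deny `getpid(2)`, allow everything else), which is the *inverse* of gVisor's approach: the Sentry's real filter is a restrictive whitelist. The constants are the Linux UAPI values, the struct layout assumes linux/amd64, and a production filter must also validate `seccomp_data.arch` and cover all threads:

```golang
// Minimal seccomp-bpf sketch: deny getpid(2) with EPERM, allow all else.
// Illustrative only; not gVisor's actual filter.
package main

import (
	"fmt"
	"runtime"
	"syscall"
	"unsafe"

	"golang.org/x/net/bpf"
	"golang.org/x/sys/unix"
)

const (
	seccompRetAllow   = 0x7fff0000 // SECCOMP_RET_ALLOW
	seccompRetErrno   = 0x00050000 // SECCOMP_RET_ERRNO
	seccompModeFilter = 2          // SECCOMP_MODE_FILTER
)

// sockFprog mirrors struct sock_fprog on linux/amd64.
type sockFprog struct {
	len    uint16
	_      [6]byte
	filter *bpf.RawInstruction
}

func main() {
	// The filter applies to the installing thread; pin the goroutine so the
	// demonstration below runs under it. Production filters use TSYNC.
	runtime.LockOSThread()

	insns, err := bpf.Assemble([]bpf.Instruction{
		// Load seccomp_data.nr; a real filter must also check .arch.
		bpf.LoadAbsolute{Off: 0, Size: 4},
		bpf.JumpIf{Cond: bpf.JumpEqual, Val: uint32(unix.SYS_GETPID), SkipTrue: 1},
		bpf.RetConstant{Val: seccompRetAllow},
		bpf.RetConstant{Val: seccompRetErrno | uint32(unix.EPERM)},
	})
	if err != nil {
		panic(err)
	}
	if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
		panic(err)
	}
	prog := sockFprog{len: uint16(len(insns)), filter: &insns[0]}
	if _, _, errno := syscall.Syscall(unix.SYS_PRCTL, unix.PR_SET_SECCOMP,
		seccompModeFilter, uintptr(unsafe.Pointer(&prog))); errno != 0 {
		panic(errno)
	}
	fmt.Println("getpid now returns:", syscall.Getpid()) // -1 under the filter
}
```

The hooks live in the host kernel, so even a fully compromised filtered process cannot widen its own syscall surface, which is exactly the property the Sentry relies on as its second layer of defense.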
[dirtycow]: https://en.wikipedia.org/wiki/Dirty_COW [clang]: https://en.wikipedia.org/wiki/C_(programming_language) diff --git a/g3doc/architecture_guide/security.png b/g3doc/architecture_guide/security.png new file mode 100644 index 000000000..c29befbf6 Binary files /dev/null and b/g3doc/architecture_guide/security.png differ diff --git a/g3doc/architecture_guide/security.svg b/g3doc/architecture_guide/security.svg new file mode 100644 index 000000000..0575e2dec --- /dev/null +++ b/g3doc/architecture_guide/security.svg @@ -0,0 +1,153 @@ + [SVG source unrenderable in this view; no legible labels survive beyond "image/svg+xml".] diff --git a/g3doc/user_guide/filesystem.md b/g3doc/user_guide/filesystem.md index 6c69f42a1..cd00762dd 100644 --- a/g3doc/user_guide/filesystem.md +++ b/g3doc/user_guide/filesystem.md @@ -4,8 +4,8 @@ gVisor accesses the filesystem through a file proxy, called the Gofer. The gofer runs as a separate process, that is isolated from the sandbox. Gofer instances -communicate with their respective sentry using the 9P protocol. For a more -detailed explanation see [Overview > Gofer](../../architecture_guide/#gofer). +communicate with their respective sentry using the 9P protocol. For another +explanation see [What is gVisor?](../README.md). ## Sandbox overlay diff --git a/g3doc/user_guide/platforms.md b/g3doc/user_guide/platforms.md index eefb6b222..752025881 100644 --- a/g3doc/user_guide/platforms.md +++ b/g3doc/user_guide/platforms.md @@ -1,56 +1,27 @@ -# Platforms (KVM) +# Changing Platforms [TOC] -This document will help you set up your system to use a different gVisor -platform. +This guide describes how to change the +[platform](../architecture_guide/platforms.md) used by `runsc`. -## What is a Platform? +## Prerequisites -gVisor requires a *platform* to implement interception of syscalls, basic -context switching, and memory mapping functionality. These are described in more -depth in the [Platform Design](../../architecture_guide/platforms/). +If you intend to run the KVM platform, you will also need to have KVM installed on +your system. If you are running a Debian-based system like Debian or Ubuntu, you +can usually do this by ensuring the module is loaded, and permissions are +appropriately set on the `/dev/kvm` device. -## Selecting a Platform - -The platform is selected by the `--platform` command line flag passed to -`runsc`. By default, the ptrace platform is selected. To select a different -platform, modify your Docker configuration (`/etc/docker/daemon.json`) to pass -this argument: - -```json -{ - "runtimes": { - "runsc": { - "path": "/usr/local/bin/runsc", - "runtimeArgs": [ - "--platform=kvm" - ] - } - } -} -``` - -You must restart the Docker daemon after making changes to this file, typically -this is done via `systemd`: +If you have an Intel CPU: ```bash -sudo systemctl restart docker +sudo modprobe kvm-intel && sudo chmod a+rw /dev/kvm ``` -## Example: Using the KVM Platform - -The KVM platform is currently experimental; however, it provides several -benefits over the default ptrace platform. - -### Prerequisites - -You will also to have KVM installed on your system. If you are running a Debian -based system like Debian or Ubuntu you can usually do this by installing the -`qemu-kvm` package. +If you have an AMD CPU: ```bash -sudo apt-get install qemu-kvm +sudo modprobe kvm-amd && sudo chmod a+rw /dev/kvm ``` If you are using a virtual machine you will need to make sure that nested @@ -68,31 +39,22 @@ cause of security issues (e.g.
[CVE-2018-12904](https://nvd.nist.gov/vuln/detail/CVE-2018-12904)). It is not recommended for production.*** -### Configuring Docker - -Per above, you will need to configure Docker to use `runsc` with the KVM -platform. You will remember from the Docker Quick Start that you configured -Docker to use `runsc` as the runtime. Docker allows you to add multiple runtimes -to the Docker configuration. +## Configuring Docker -Add a new entry for the KVM platform entry to your Docker configuration -(`/etc/docker/daemon.json`) in order to provide the `--platform=kvm` runtime -argument. - -In the end, the file should look something like: +The platform is selected by the `--platform` command line flag passed to +`runsc`. By default, the ptrace platform is selected. For example, to select the +KVM platform, modify your Docker configuration (`/etc/docker/daemon.json`) to +pass the `--platform` argument: ```json { "runtimes": { "runsc": { - "path": "/usr/local/bin/runsc" - }, - "runsc-kvm": { "path": "/usr/local/bin/runsc", "runtimeArgs": [ "--platform=kvm" ] - } + } } } ``` @@ -104,13 +66,27 @@ this is done via `systemd`: sudo systemctl restart docker ``` -## Running a container +Note that you may configure multiple runtimes using different platforms. For +example, the following configuration has one configuration for ptrace and one +for the KVM platform: -Now run your container using the `runsc-kvm` runtime. This will run the -container using the KVM platform: - -```bash -docker run --runtime=runsc-kvm --rm hello-world +```json +{ + "runtimes": { + "runsc-ptrace": { + "path": "/usr/local/bin/runsc", + "runtimeArgs": [ + "--platform=ptrace" + ] + }, + "runsc-kvm": { + "path": "/usr/local/bin/runsc", + "runtimeArgs": [ + "--platform=kvm" + ] + } + } +} ``` [nested-azure]: https://docs.microsoft.com/en-us/azure/virtual-machines/windows/nested-virtualization diff --git a/pkg/sentry/arch/syscalls_arm64.go b/pkg/sentry/arch/syscalls_arm64.go index 92d062513..95dfd1e90 100644 --- a/pkg/sentry/arch/syscalls_arm64.go +++ b/pkg/sentry/arch/syscalls_arm64.go @@ -23,7 +23,7 @@ const restartSyscallNr = uintptr(128) // // In linux, at the entry of the syscall handler(el0_svc_common()), value of R0 // is saved to the pt_regs.orig_x0 in kernel code. But currently, the orig_x0 -// was not accessible to the user space application, so we have to do the same +// was not accessible to the userspace application, so we have to do the same // operation in the sentry code to save the R0 value into the App context. func (c *context64) SyscallSaveOrig() { c.OrigR0 = c.Regs.Regs[0] diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go index 5a1d4fd57..aacf28da2 100644 --- a/pkg/sentry/kernel/pipe/pipe_util.go +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -144,7 +144,7 @@ func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgume if v > math.MaxInt32 { v = math.MaxInt32 // Silently truncate. } - // Copy result to user-space. + // Copy result to userspace. 
_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ AddressSpaceActive: true, }) diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index c9db78e06..a5903b0b5 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -199,10 +199,10 @@ func (t *Task) doSyscall() taskRunState { // // On x86, register rax was shared by syscall number and return // value, and at the entry of the syscall handler, the rax was - // saved to regs.orig_rax which was exposed to user space. + // saved to regs.orig_rax which was exposed to userspace. // But on arm64, syscall number was passed through X8, and the X0 // was shared by the first syscall argument and return value. The - // X0 was saved to regs.orig_x0 which was not exposed to user space. + // X0 was saved to regs.orig_x0 which was not exposed to userspace. // So we have to do the same operation here to save the X0 value // into the task context. t.Arch().SyscallSaveOrig() diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 9dea2b5ff..60df51dae 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -2718,7 +2718,7 @@ func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.Sy v = math.MaxInt32 } - // Copy result to user-space. + // Copy result to userspace. _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ AddressSpaceActive: true, }) @@ -2787,7 +2787,7 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc if v > math.MaxInt32 { v = math.MaxInt32 } - // Copy result to user-space. + // Copy result to userspace. _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ AddressSpaceActive: true, }) @@ -2803,7 +2803,7 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc v = math.MaxInt32 } - // Copy result to user-space. + // Copy result to userspace. _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ AddressSpaceActive: true, }) diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index b39ffa9fb..0ab4c3e19 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -235,11 +235,11 @@ type RcvBufAutoTuneParams struct { // was started. MeasureTime time.Time - // CopiedBytes is the number of bytes copied to user space since + // CopiedBytes is the number of bytes copied to userspace since // this measure began. CopiedBytes int - // PrevCopiedBytes is the number of bytes copied to user space in + // PrevCopiedBytes is the number of bytes copied to userspace in // the previous RTT period. PrevCopiedBytes int diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 71735029e..b5ba972f1 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1097,7 +1097,7 @@ func (e *endpoint) initialReceiveWindow() int { } // ModerateRecvBuf adjusts the receive buffer and the advertised window -// based on the number of bytes copied to user space. +// based on the number of bytes copied to userspace. 
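//
// ModerateRecvBuf is normally invoked from a layer above tcpip.Endpoint (e.g.
// by the sentry) after the data has been copied out to userspace, since the
// endpoint itself does not observe that copy.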
func (e *endpoint) ModerateRecvBuf(copied int) { e.LockUser() defer e.UnlockUser() diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 0b4512c65..6ef32a1b3 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -5869,7 +5869,7 @@ func TestReceiveBufferAutoTuning(t *testing.T) { // Invoke the moderation API. This is required for auto-tuning // to happen. This method is normally expected to be invoked // from a higher layer than tcpip.Endpoint. So we simulate - // copying to user-space by invoking it explicitly here. + // copying to userspace by invoking it explicitly here. c.EP.ModerateRecvBuf(totalCopied) // Now send a keep-alive packet to trigger an ACK so that we can diff --git a/runsc/cmd/help.go b/runsc/cmd/help.go index c7d210140..cd85dabbb 100644 --- a/runsc/cmd/help.go +++ b/runsc/cmd/help.go @@ -65,16 +65,10 @@ func (h *Help) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{} switch f.NArg() { case 0: fmt.Fprintf(h.cdr.Output, "Usage: %s \n\n", h.cdr.Name()) - fmt.Fprintf(h.cdr.Output, `runsc is a command line client for running applications packaged in the Open -Container Initiative (OCI) format. Applications run by runsc are run in an -isolated gVisor sandbox that emulates a Linux environment. + fmt.Fprintf(h.cdr.Output, `runsc is the gVisor container runtime. -gVisor is a user-space kernel, written in Go, that implements a substantial -portion of the Linux system call interface. It provides an additional layer -of isolation between running applications and the host operating system. - -Functionality is provided by subcommands. For additonal help on individual -subcommands use "%s %s ". +Functionality is provided by subcommands. For help with a specific subcommand, +use "%s %s ". `, h.cdr.Name(), h.Name()) h.cdr.VisitGroups(func(g *subcommands.CommandGroup) { diff --git a/website/BUILD b/website/BUILD index d6afd5f44..c97b2560b 100644 --- a/website/BUILD +++ b/website/BUILD @@ -138,7 +138,6 @@ docs( "//g3doc:community", "//g3doc:index", "//g3doc:roadmap", - "//g3doc/architecture_guide:index", "//g3doc/architecture_guide:performance", "//g3doc/architecture_guide:platforms", "//g3doc/architecture_guide:resources", diff --git a/website/_layouts/docs.html b/website/_layouts/docs.html index 33ea8e1de..549305089 100644 --- a/website/_layouts/docs.html +++ b/website/_layouts/docs.html @@ -51,7 +51,9 @@ categories: Create issue

{% endif %} +
{{ content }} +
diff --git a/website/_sass/front.scss b/website/_sass/front.scss index 44a7e3473..0e4208f3c 100644 --- a/website/_sass/front.scss +++ b/website/_sass/front.scss @@ -4,12 +4,14 @@ background-repeat: no-repeat; background-size: cover; background-blend-mode: darken; - background-color: rgba(0, 0, 0, 0.1); + background-color: rgba(0, 0, 0, 0.3); p { color: #fff; margin-top: 0; margin-bottom: 0; font-weight: 300; + font-size: 24px; + line-height: 30px; } } diff --git a/website/_sass/style.scss b/website/_sass/style.scss index 520ea469a..4deb945d4 100644 --- a/website/_sass/style.scss +++ b/website/_sass/style.scss @@ -142,3 +142,13 @@ table th { margin-top: 10px; margin-bottom: 20px; } + +.docs-content * img { + display: block; + margin: 20px auto; +} + +.blog-content * img { + display: block; + margin: 20px auto; +} diff --git a/website/blog/2019-11-18-security-basics.md b/website/blog/2019-11-18-security-basics.md index ed6d97ffe..fbdd511dd 100644 --- a/website/blog/2019-11-18-security-basics.md +++ b/website/blog/2019-11-18-security-basics.md @@ -56,15 +56,9 @@ in combination: redundant walls, scattered draw bridges, small bottle-neck entrances, moats, etc. A simplified version of the design is below -([more detailed version](/docs/architecture_guide/))[^2]: +([more detailed version](/docs/))[^2]: --------------------------------------------------------------------------------- - -![Figure 1](/assets/images/2019-11-18-security-basics-figure1.png) - -Figure 1: Simplified design of gVisor. - --------------------------------------------------------------------------------- +![Figure 1](/assets/images/2019-11-18-security-basics-figure1.png "Simplified design of gVisor.") In order to discuss design principles, the following components are important to know: @@ -134,13 +128,7 @@ minimum level of permission is required for it to perform its function. Specifically, the closer you are to the untrusted application, the less privilege you have. --------------------------------------------------------------------------------- - -![Figure 2](/assets/images/2019-11-18-security-basics-figure2.png) - -Figure 2: runsc components and their privileges. - --------------------------------------------------------------------------------- +![Figure 2](/assets/images/2019-11-18-security-basics-figure2.png "runsc components and their privileges.") This is evident in how runsc (the drop in gVisor binary for Docker/Kubernetes) constructs the sandbox. The Sentry has the least privilege possible (it can't @@ -222,15 +210,7 @@ the host Linux syscalls. In other words, with gVisor, applications get the vast majority (and growing) functionality of Linux containers for only 68 possible syscalls to the Host OS. 350 syscalls to 68 is attack surface reduction. --------------------------------------------------------------------------------- - -![Figure 3](/assets/images/2019-11-18-security-basics-figure3.png) - -Figure 3: Reduction of Attack Surface of the Syscall Table. Note that the -Senty's Syscall Emulation Layer keeps the Containerized Process from ever -calling the Host OS. - --------------------------------------------------------------------------------- +![Figure 3](/assets/images/2019-11-18-security-basics-figure3.png "Reduction of Attack Surface of the Syscall Table. 
Note that the Sentry's Syscall Emulation Layer keeps the Containerized Process
from ever calling the Host OS.")

## Secure-by-default

diff --git a/website/blog/2020-04-02-networking-security.md b/website/blog/2020-04-02-networking-security.md
index 78f0a6714..5a5e38fd7 100644
--- a/website/blog/2020-04-02-networking-security.md
+++ b/website/blog/2020-04-02-networking-security.md
@@ -69,13 +69,7 @@ a similar syscall). Moreover, because packets typically come from off-host
 (e.g. the internet), the Host OS's packet processing code has received a lot of
 scrutiny, hopefully resulting in a high degree of hardening.

--------------------------------------------------------------------------------
-
-![Figure 1](/assets/images/2020-04-02-networking-security-figure1.png)
-
-Figure 1: Netstack and gVisor
-
--------------------------------------------------------------------------------
+![Figure 1](/assets/images/2020-04-02-networking-security-figure1.png "Netstack and gVisor.")

 ## Writing a network stack

diff --git a/website/index.md b/website/index.md
index 95d5d16f0..84f877d49 100644
--- a/website/index.md
+++ b/website/index.md
@@ -3,10 +3,10 @@
-

gVisor is an application kernel and container runtime providing defense-in-depth for containers anywhere.

+

gVisor is an application kernel for containers that provides efficient defense-in-depth anywhere.

+ Quick start  Learn More  - GitHub 

@@ -19,8 +19,8 @@

Container-native Security

-

By providing each container with its own userspace kernel, gVisor limits - the attack surface of the host. This protection does not limit +

By providing each container with its own application kernel, gVisor + limits the attack surface of the host. This protection does not limit functionality: gVisor runs unmodified binaries and integrates with container orchestration systems, such as Docker and Kubernetes, and supports features such as volumes and sidecars.

@@ -43,7 +43,7 @@ The pluggable platform architecture of gVisor allows it to run anywhere, enabling consistent security policies across multiple environments without having to rearchitect your infrastructure.

- Get Started » + Read More »
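As a usage sketch tying the platform guide above together: with the
multi-runtime configuration shown there (the `runsc-ptrace` and `runsc-kvm`
runtime names are assumptions carried over from that example), the platform is
chosen per container at run time:

```bash
# Run under the ptrace platform.
docker run --rm --runtime=runsc-ptrace hello-world

# Run the same image under the KVM platform.
docker run --rm --runtime=runsc-kvm hello-world
```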
-- cgit v1.2.3 From 61e68798828b500f093c4c129cb87361c2fe4a4f Mon Sep 17 00:00:00 2001
From: Ian Lewis
Date: Wed, 20 May 2020 16:04:16 -0700
Subject: Update the Docker quickstart to use 'runsc install'

PiperOrigin-RevId: 312573487
---
 g3doc/user_guide/install.md            |  9 +++------
 g3doc/user_guide/quick_start/docker.md | 36 +++++++++++++++++++---------------
 2 files changed, 23 insertions(+), 22 deletions(-)

(limited to 'g3doc')

diff --git a/g3doc/user_guide/install.md b/g3doc/user_guide/install.md
index 0de2b9932..9afdd264d 100644
--- a/g3doc/user_guide/install.md
+++ b/g3doc/user_guide/install.md
@@ -150,11 +150,8 @@ users, and ensure it is executable by all users**, since `runsc` executes
 itself as user `nobody` to avoid unnecessary privileges. The `/usr/local/bin`
 directory is a good place to put the `runsc` binary.

-After installation, the `runsc` binary comes with an `install` command that can
-optionally automatically configure Docker:
-
-```bash
-runsc install
-```
+After installation, try out `runsc` by following the
+[Docker Quick Start](./quick_start/docker.md) or
+[OCI Quick Start](./quick_start/oci.md).

 [releases]: https://github.com/google/gvisor/releases
diff --git a/g3doc/user_guide/quick_start/docker.md b/g3doc/user_guide/quick_start/docker.md
index fa8b9076b..6ad594ecc 100644
--- a/g3doc/user_guide/quick_start/docker.md
+++ b/g3doc/user_guide/quick_start/docker.md
@@ -14,24 +14,28 @@ the next section and proceed straight to running a container.

 ## Configuring Docker

 First you will need to configure Docker to use `runsc` by adding a runtime entry
-to your Docker configuration (`/etc/docker/daemon.json`). You may have to create
-this file if it does not exist. Also, some Docker versions also require you to
-[specify the `storage-driver` field][storage-driver].
-
-In the end, the file should look something like:
-
-```json
-{
-    "runtimes": {
-        "runsc": {
-            "path": "/usr/local/bin/runsc"
-        }
-    }
-}
+to your Docker configuration (e.g. `/etc/docker/daemon.json`). The easiest way
+to do this is via the `runsc install` command. This will install a Docker
+runtime named "runsc" by default.
+
+```bash
+sudo runsc install
+```
+
+You may also wish to install a runtime entry for debugging. The `runsc install`
+command can accept options that will be passed to the runtime when it is invoked
+by Docker.
+
+```bash
+sudo runsc install --runtime runsc-debug -- \
+  --debug \
+  --debug-log=/tmp/runsc-debug.log \
+  --strace \
+  --log-packets
 ```

-You must restart the Docker daemon after making changes to this file, typically
-this is done via `systemd`:
+You must restart the Docker daemon after installing the runtime. Typically this
+is done via `systemd`:

 ```bash
 sudo systemctl restart docker
-- cgit v1.2.3 From be6e9bbf559a33d02e8307df94b032bcef8b6b5a Mon Sep 17 00:00:00 2001
From: Mikael Mello
Date: Mon, 25 May 2020 18:33:49 -0300
Subject: Fix typo in WordPress tutorial page

---
 g3doc/user_guide/tutorials/docker.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'g3doc')

diff --git a/g3doc/user_guide/tutorials/docker.md b/g3doc/user_guide/tutorials/docker.md
index c0a3db506..705560038 100644
--- a/g3doc/user_guide/tutorials/docker.md
+++ b/g3doc/user_guide/tutorials/docker.md
@@ -1,4 +1,4 @@
-# WorkPress with Docker
+# WordPress with Docker

 This page shows you how to deploy a sample [WordPress][wordpress] site using
 [Docker][docker].
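A quick smoke test for the quick start configuration above (a sketch, assuming
the `runsc` and `runsc-debug` runtime names installed there):

```bash
# dmesg inside the sandbox prints gVisor's own kernel messages rather than
# the host's, which confirms the container is actually sandboxed.
docker run --rm --runtime=runsc alpine dmesg

# The debug runtime behaves identically, but also writes detailed logs to
# /tmp/runsc-debug.log on the host.
docker run --rm --runtime=runsc-debug alpine dmesg
```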
-- cgit v1.2.3 From bab3c36efb5486fdfbfa52d8baf810c7a7c7efd8 Mon Sep 17 00:00:00 2001
From: Ian Gudger
Date: Fri, 26 Jun 2020 21:09:25 -0700
Subject: Add style guide.

PiperOrigin-RevId: 318591900
---
 CONTRIBUTING.md |  8 ++----
 g3doc/style.md  | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+), 6 deletions(-)
 create mode 100644 g3doc/style.md

(limited to 'g3doc')

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3f8f4c985..89180eb3f 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -37,10 +37,8 @@ Dependencies can be added by using `go mod get`. In order to keep the

 ### Coding Guidelines

-All Go code should conform to the [Go style guidelines][gostyle]. C++ code
-should conform to the [Google C++ Style Guide][cppstyle] and the guidelines
-described for tests. Note that code may be automatically formatted per the
-guidelines when merged.
+All code should comply with the [style guide](g3doc/style.md). Note that code
+may be automatically formatted per the guidelines when merged.

 As a secure runtime, we need to maintain the safety of all of code included in
 gVisor. The following rules help mitigate issues.
@@ -125,9 +123,7 @@ Contributions made by corporations are covered by a different agreement than
 the one above, the [Software Grant and Corporate Contributor License
 Agreement][gccla].

-[cppstyle]: https://google.github.io/styleguide/cppguide.html
 [gcla]: https://cla.developers.google.com/about/google-individual
 [gccla]: https://cla.developers.google.com/about/google-corporate
 [github]: https://github.com/google/gvisor/compare
 [gvisor-dev-list]: https://groups.google.com/forum/#!forum/gvisor-dev
-[gostyle]: https://github.com/golang/go/wiki/CodeReviewComments
diff --git a/g3doc/style.md b/g3doc/style.md
new file mode 100644
index 000000000..d10549fe9
--- /dev/null
+++ b/g3doc/style.md
@@ -0,0 +1,88 @@
+# Provisional style guide
+
+> These guidelines are new and may change. This note will be removed when
+> consensus is reached.
+
+Not all existing code will comply with this style guide, but new code should.
+Further, it is a goal to eventually update all existing code to be in
+compliance.
+
+## All code
+
+### Early exit
+
+All code, unless it substantially increases the line count or complexity, should
+use early exits from loops and functions where possible.
+
+## Go specific
+
+All Go code should comply with the [Go Code Review Comments][gostyle] and
+[Effective Go][effective_go] guides, as well as the additional guidelines
+described below.
+
+### Mutexes
+
+#### Naming
+
+Mutexes should be named mu or xxxMu. Mutexes as a general rule should not be
+exported. Instead, export methods which use the mutexes to avoid leaky
+abstractions.
+
+#### Location
+
+Mutexes should be sibling fields to the fields that they protect. Mutexes
+should not be declared as global variables; instead, use a struct (anonymous
+ok, but naming conventions still apply).
+
+Mutexes should be ordered before the fields that they protect.
+
+#### Comments
+
+Mutexes should have a comment on their declaration explaining any ordering
+requirements (or pointing to where this information can be found), if
+applicable. There is no need for a comment explaining which fields are
+protected.
+
+Each field or variable protected by a mutex should state as such in a comment
+on the field or variable declaration.
+
+### Unused returns
+
+Unused returns should be explicitly ignored with underscores. If there is a
+function which is commonly used without using its return(s), a wrapper function
+should be declared which explicitly ignores the returns. That said, in many
+cases, it may make sense for the wrapper to check the returns.
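As a minimal sketch of the mutex guidelines above (all type and field names
here are illustrative, not from the gVisor tree):

```go
package registry

import "sync"

// sandbox is a stand-in payload type for the example.
type sandbox struct {
	id string
}

// registry tracks active sandboxes.
type registry struct {
	// mu protects sandboxes. It has no ordering requirements with respect
	// to other mutexes.
	mu sync.Mutex

	// sandboxes maps sandbox IDs to sandboxes. Protected by mu.
	sandboxes map[string]*sandbox
}

// IDs returns the IDs of all registered sandboxes. The mutex itself is not
// exported; callers go through methods like this one instead.
func (r *registry) IDs() []string {
	r.mu.Lock()
	defer r.mu.Unlock()
	ids := make([]string, 0, len(r.sandboxes))
	for id := range r.sandboxes {
		ids = append(ids, id)
	}
	return ids
}
```

Note how `mu` is unexported, is declared immediately before the field it
protects, and carries a comment about ordering requirements.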
+### Formatting verbs
+
+Built-in types should use their associated verbs (e.g. %d for integral types),
+but other types should use a %v variant, even if they implement fmt.Stringer.
+The built-in `error` type should use %w when formatted with `fmt.Errorf`, but
+only then.
+
+### Wrapping
+
+Comments should be wrapped at 80 columns with a 2 space tab size.
+
+Code does not need to be wrapped, but if wrapping would make it more readable,
+it should be wrapped with each subcomponent of the thing being wrapped on its
+own line. For example, if a struct is split between lines, each field should be
+on its own line.
+
+#### Example
+
+```go
+_ = exec.Cmd{
+  Path: "/foo/bar",
+  Args: []string{"-baz"},
+}
+```
+
+## C++ specific
+
+C++ code should conform to the [Google C++ Style Guide][cppstyle] and the
+guidelines described for tests.
+
+[cppstyle]: https://google.github.io/styleguide/cppguide.html
+[gostyle]: https://github.com/golang/go/wiki/CodeReviewComments
+[effective_go]: https://golang.org/doc/effective_go.html
-- cgit v1.2.3 From dce2dfae046cdc29b2ac3b949a76d6b0ee2348a5 Mon Sep 17 00:00:00 2001
From: Dean Deng
Date: Tue, 30 Jun 2020 11:24:10 -0700
Subject: Add build target for the provisional style guide.

This includes the provisional style guide in the website and fixes the broken
link from CONTRIBUTING.md. The style guide will be located under the
"Community" category as it's related to contributing to the project.

Also, add missing includes that were causing some presubmits to fail.

PiperOrigin-RevId: 319061410
---
 g3doc/BUILD                  | 9 +++++++++
 test/syscalls/linux/fcntl.cc | 3 +++
 website/BUILD                | 1 +
 3 files changed, 13 insertions(+)

(limited to 'g3doc')

diff --git a/g3doc/BUILD b/g3doc/BUILD
index dbbf96204..c315d38be 100644
--- a/g3doc/BUILD
+++ b/g3doc/BUILD
@@ -33,3 +33,12 @@ doc(
     subcategory = "Community",
     weight = "95",
 )
+
+doc(
+    name = "style",
+    src = "style.md",
+    category = "Project",
+    permalink = "/community/style_guide/",
+    subcategory = "Community",
+    weight = "10",
+)
diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc
index 9130618fa..4b9b4db99 100644
--- a/test/syscalls/linux/fcntl.cc
+++ b/test/syscalls/linux/fcntl.cc
@@ -18,7 +18,10 @@
 #include
 #include
+#include
+#include
 #include
+#include

 #include "gtest/gtest.h"
 #include "absl/base/macros.h"
diff --git a/website/BUILD b/website/BUILD
index c97b2560b..4488cb543 100644
--- a/website/BUILD
+++ b/website/BUILD
@@ -138,6 +138,7 @@ docs(
         "//g3doc:community",
         "//g3doc:index",
         "//g3doc:roadmap",
+        "//g3doc:style",
         "//g3doc/architecture_guide:performance",
         "//g3doc/architecture_guide:platforms",
         "//g3doc/architecture_guide:resources",
-- cgit v1.2.3 From 068716ddf36f4dcb3d88e92b90774dcba2fe4db8 Mon Sep 17 00:00:00 2001
From: Michael Pratt
Date: Wed, 1 Jul 2020 08:51:57 -0700
Subject: Fix FAQ URL

The existing gvisor.dev/faq link returns 404 because the full URL has
mistakenly been capitalized.
PiperOrigin-RevId: 319233173 --- g3doc/user_guide/BUILD | 2 +- website/_includes/footer-links.html | 2 +- website/cmd/server/main.go | 4 ++++ 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'g3doc') diff --git a/g3doc/user_guide/BUILD b/g3doc/user_guide/BUILD index 5568e1ba4..b69aee12c 100644 --- a/g3doc/user_guide/BUILD +++ b/g3doc/user_guide/BUILD @@ -33,7 +33,7 @@ doc( name = "FAQ", src = "FAQ.md", category = "User Guide", - permalink = "/docs/user_guide/FAQ/", + permalink = "/docs/user_guide/faq/", weight = "90", ) diff --git a/website/_includes/footer-links.html b/website/_includes/footer-links.html index 10c28ead4..2036dbaa9 100644 --- a/website/_includes/footer-links.html +++ b/website/_includes/footer-links.html @@ -15,7 +15,7 @@
diff --git a/website/cmd/server/main.go b/website/cmd/server/main.go
index 7c8bc9bfa..c401b6abd 100644
--- a/website/cmd/server/main.go
+++ b/website/cmd/server/main.go
@@ -35,6 +35,10 @@ var redirects = map[string]string{
 	// For links.
 	"/faq": "/docs/user_guide/faq/",

+	// From 2020-05-12 to 2020-06-30, the FAQ URL was uppercase. Redirect that
+	// back to maintain any links.
+	"/docs/user_guide/FAQ/": "/docs/user_guide/faq/",
+
 	// Redirects to compatibility docs.
 	"/c":             "/docs/user_guide/compatibility/",
 	"/c/linux/amd64": "/docs/user_guide/compatibility/linux/amd64/",
-- cgit v1.2.3 From c0ea7d9e9e6403db4718eac79ac480ef55b9073c Mon Sep 17 00:00:00 2001
From: Wietse Venema
Date: Tue, 7 Jul 2020 16:32:15 +0200
Subject: README.md: Commpatibility > Compatibility

---
 g3doc/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'g3doc')

diff --git a/g3doc/README.md b/g3doc/README.md
index 304a91493..7956fe739 100644
--- a/g3doc/README.md
+++ b/g3doc/README.md
@@ -152,7 +152,7 @@ The application is a normal Linux binary provided to gVisor in an OCI
 runtime bundle. gVisor aims to provide an environment equivalent to Linux v4.4,
 so applications should be able to run unmodified. However, gVisor does not
 presently implement every system call, `/proc` file, or `/sys` file so some
-incompatibilities may occur. See [Commpatibility](./user_guide/compatibility.md)
+incompatibilities may occur. See [Compatibility](./user_guide/compatibility.md)
 for more information.

 [9p]: https://en.wikipedia.org/wiki/9P_(protocol)
-- cgit v1.2.3 From 8d2910a04dec5ef2c79034d35fce68e9f414d144 Mon Sep 17 00:00:00 2001
From: Michael Pratt
Date: Thu, 9 Jul 2020 09:03:14 -0700
Subject: Explain how to bypass the Docker proxy

Neither myself nor bhaskerh@ can consistently remember how to do this.

PiperOrigin-RevId: 320407005
---
 g3doc/user_guide/debugging.md | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'g3doc')

diff --git a/g3doc/user_guide/debugging.md b/g3doc/user_guide/debugging.md
index 0525fd5c0..54fdce34f 100644
--- a/g3doc/user_guide/debugging.md
+++ b/g3doc/user_guide/debugging.md
@@ -129,3 +129,13 @@ go tool pprof -top /usr/local/bin/runsc /tmp/cpu.prof
 ```

 [pprof]: https://github.com/google/pprof/blob/master/doc/README.md
+
+### Docker Proxy
+
+When forwarding a port to the container, Docker will likely route traffic
+through the [docker-proxy][]. This proxy may make profiling noisy, so it can be
+helpful to bypass it. Do so by sending traffic directly to the container IP and
+port. For example, if the `docker0` IP is `192.168.9.1`, the container IP is
+likely a subsequent IP, such as `192.168.9.2`.
+
+[docker-proxy]: https://windsock.io/the-docker-proxy/
-- cgit v1.2.3 From e506fcd9314d3402c9e83974e9cc335348b53360 Mon Sep 17 00:00:00 2001
From: Ian Lewis
Date: Thu, 9 Jul 2020 15:25:35 -0700
Subject: Add args and netns flag to runsc spec

Adds a netns flag to runsc spec that allows users to specify a network
namespace path when creating a sample config.json file. It also adds the
ability to specify the command arguments used when running the container. This
will make it easier for new users to create sample OCI bundles without having
to edit the config.json by hand.
PiperOrigin-RevId: 320486267
---
 g3doc/user_guide/quick_start/oci.md |  10 +-
 g3doc/user_guide/tutorials/cni.md   |  14 ++-
 runsc/cmd/spec.go                   | 220 +++++++++++++++++++-----------------
 3 files changed, 129 insertions(+), 115 deletions(-)

(limited to 'g3doc')

diff --git a/g3doc/user_guide/quick_start/oci.md b/g3doc/user_guide/quick_start/oci.md
index 877169145..e7768946b 100644
--- a/g3doc/user_guide/quick_start/oci.md
+++ b/g3doc/user_guide/quick_start/oci.md
@@ -15,8 +15,8 @@ mkdir bundle
 cd bundle
 ```

-Create a root file system for the container. We will use the Docker hello-world
-image as the basis for our container.
+Create a root file system for the container. We will use the Docker
+`hello-world` image as the basis for our container.

 ```bash
 mkdir rootfs
 docker export $(docker create hello-world) | tar -xf - -C rootfs
 ```

 Next, create a specification file called `config.json` that contains our
-container specification. We will update the default command it runs to `/hello`
-in the `hello-world` container.
+container specification. We tell the container to run the `/hello` program.

 ```bash
-runsc spec
-sed -i 's;"sh";"/hello";' config.json
+runsc spec -- /hello
 ```

 Finally run the container.
diff --git a/g3doc/user_guide/tutorials/cni.md b/g3doc/user_guide/tutorials/cni.md
index ad6c9fa59..ce2fd09a8 100644
--- a/g3doc/user_guide/tutorials/cni.md
+++ b/g3doc/user_guide/tutorials/cni.md
@@ -128,12 +128,14 @@ sudo mkdir -p rootfs/var/www/html
 sudo sh -c 'echo "Hello World!" > rootfs/var/www/html/index.html'
 ```

-Next create the `config.json` specifying the network namespace. `sudo
-/usr/local/bin/runsc spec sudo sed -i 's;"sh";"python", "-m", "http.server";'
-config.json sudo sed -i "s;\"cwd\": \"/\";\"cwd\": \"/var/www/html\";"
-config.json sudo sed -i "s;\"type\": \"network\";\"type\":
-\"network\",\n\t\t\t\t\"path\": \"/var/run/netns/${CNI_CONTAINERID}\";"
-config.json`
+Next, create the `config.json`, specifying the network namespace.
+ +``` +sudo /usr/local/bin/runsc spec \ + --cwd /var/www/html \ + --netns /var/run/netns/${CNI_CONTAINERID} \ + -- python -m http.server +``` ## Run the Container diff --git a/runsc/cmd/spec.go b/runsc/cmd/spec.go index a2b0a4b14..55194e641 100644 --- a/runsc/cmd/spec.go +++ b/runsc/cmd/spec.go @@ -16,124 +16,122 @@ package cmd import ( "context" - "fmt" - "io/ioutil" + "encoding/json" + "io" "os" "path/filepath" "github.com/google/subcommands" + specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/runsc/flag" ) -func genSpec(cwd string) []byte { - var template = fmt.Sprintf(`{ - "ociVersion": "1.0.0", - "process": { - "terminal": true, - "user": { - "uid": 0, - "gid": 0 - }, - "args": [ - "sh" - ], - "env": [ - "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", - "TERM=xterm" - ], - "cwd": "%s", - "capabilities": { - "bounding": [ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE" - ], - "effective": [ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE" - ], - "inheritable": [ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE" - ], - "permitted": [ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE" - ], - "ambient": [ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE" - ] - }, - "rlimits": [ - { - "type": "RLIMIT_NOFILE", - "hard": 1024, - "soft": 1024 - } - ] - }, - "root": { - "path": "rootfs", - "readonly": true - }, - "hostname": "runsc", - "mounts": [ - { - "destination": "/proc", - "type": "proc", - "source": "proc" +func writeSpec(w io.Writer, cwd string, netns string, args []string) error { + spec := &specs.Spec{ + Version: "1.0.0", + Process: &specs.Process{ + Terminal: true, + User: specs.User{ + UID: 0, + GID: 0, + }, + Args: args, + Env: []string{ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm", + }, + Cwd: cwd, + Capabilities: &specs.LinuxCapabilities{ + Bounding: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Effective: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Inheritable: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Permitted: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + // TODO(gvisor.dev/issue/3166): support ambient capabilities + }, + Rlimits: []specs.POSIXRlimit{ + { + Type: "RLIMIT_NOFILE", + Hard: 1024, + Soft: 1024, + }, + }, }, - { - "destination": "/dev", - "type": "tmpfs", - "source": "tmpfs", - "options": [] + Root: &specs.Root{ + Path: "rootfs", + Readonly: true, }, - { - "destination": "/sys", - "type": "sysfs", - "source": "sysfs", - "options": [ - "nosuid", - "noexec", - "nodev", - "ro" - ] - } - ], - "linux": { - "namespaces": [ + Hostname: "runsc", + Mounts: []specs.Mount{ { - "type": "pid" + Destination: "/proc", + Type: "proc", + Source: "proc", }, { - "type": "network" + Destination: "/dev", + Type: "tmpfs", + Source: "tmpfs", }, { - "type": "ipc" + Destination: "/sys", + Type: "sysfs", + Source: "sysfs", + Options: []string{ + "nosuid", + "noexec", + "nodev", + "ro", + }, }, - { - "type": "uts" + }, + Linux: &specs.Linux{ + Namespaces: []specs.LinuxNamespace{ + { + Type: "pid", + }, + { + Type: "network", + Path: netns, + }, + { + Type: "ipc", + }, + { + Type: "uts", + }, + { + Type: "mount", + }, }, - { - "type": "mount" - } - ] + }, } -}`, cwd) - return []byte(template) + e := json.NewEncoder(w) + e.SetIndent("", " ") + return e.Encode(spec) } // Spec implements subcommands.Command for the "spec" 
command.
 type Spec struct {
 	bundle string
 	cwd    string
+	netns  string
 }

 // Name implements subcommands.Command.Name.
@@ -148,21 +146,26 @@ func (*Spec) Synopsis() string {

 // Usage implements subcommands.Command.Usage.
 func (*Spec) Usage() string {
-	return `spec [options] - create a new OCI bundle specification file.
+	return `spec [options] [-- args...] - create a new OCI bundle specification file.
+
+The spec command creates a new specification file (config.json) for a new OCI
+bundle.

-The spec command creates a new specification file (config.json) for a new OCI bundle.
+The specification file is a starter file that runs the command specified by
+'args' in the container. If 'args' is not specified, the default is to run the
+'sh' program.

-The specification file is a starter file that runs the "sh" command in the container. You
-should edit the file to suit your needs. You can find out more about the format of the
-specification file by visiting the OCI runtime spec repository:
+While a number of flags are provided to change values in the specification, you
+can examine the file and edit it to suit your needs after this command runs.
+You can find out more about the format of the specification file by visiting
+the OCI runtime spec repository:
 https://github.com/opencontainers/runtime-spec/

 EXAMPLE:
     $ mkdir -p bundle/rootfs
     $ cd bundle
-    $ runsc spec
+    $ runsc spec -- /hello
     $ docker export $(docker create hello-world) | tar -xf - -C rootfs
-    $ sed -i 's;"sh";"/hello";' config.json
     $ sudo runsc run hello
`
@@ -173,18 +176,29 @@ func (s *Spec) SetFlags(f *flag.FlagSet) {
 	f.StringVar(&s.bundle, "bundle", ".", "path to the root of the OCI bundle")
 	f.StringVar(&s.cwd, "cwd", "/", "working directory that will be set for the executable, "+
 		"this value MUST be an absolute path")
+	f.StringVar(&s.netns, "netns", "", "network namespace path")
 }

 // Execute implements subcommands.Command.Execute.
 func (s *Spec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	// Grab the arguments.
+	containerArgs := f.Args()
+	if len(containerArgs) == 0 {
+		containerArgs = []string{"sh"}
+	}
+
 	confPath := filepath.Join(s.bundle, "config.json")
 	if _, err := os.Stat(confPath); !os.IsNotExist(err) {
 		Fatalf("file %q already exists", confPath)
 	}

-	var spec = genSpec(s.cwd)
+	configFile, err := os.OpenFile(confPath, os.O_WRONLY|os.O_CREATE, 0664)
+	if err != nil {
+		Fatalf("opening file %q: %v", confPath, err)
+	}

-	if err := ioutil.WriteFile(confPath, spec, 0664); err != nil {
+	err = writeSpec(configFile, s.cwd, s.netns, containerArgs)
+	if err != nil {
 		Fatalf("writing to %q: %v", confPath, err)
 	}
-- cgit v1.2.3 From 2afff44403e046078301de39f0252bb57fc018c7 Mon Sep 17 00:00:00 2001
From: Adin Scannell
Date: Tue, 5 May 2020 22:00:14 -0700
Subject: Update shim to build using bazel.

The go.mod dependency tree for the shim was somehow contradictory. After
resolving these issues (e.g., explicitly importing k8s 1.14 and pulling a
specific dbus version), and adding all dependencies, the shim can now be built
as part of the regular bazel tree.

As part of this process, minor cleanup was done in all the source files:
headers were standardized (and include "The gVisor Authors" in addition to
"The containerd Authors" if originally derived from containerd sources), and
comments were cleaned up to meet coding standards.

This change makes the containerd installation dynamic, so that multiple
versions can be tested, and drops the static installer for the VM image
itself.
This change also updates test/root/crictl_test.go and related utilities, so that the containerd tests can be run on any version (and in cases where it applies, they can be run on both v1 and v2 as parameterized tests). --- BUILD | 3 + Makefile | 23 +- WORKSPACE | 597 +++++++++++++++++++++++--- g3doc/user_guide/BUILD | 9 + g3doc/user_guide/runtimeclass.md | 44 ++ go.mod | 22 +- go.sum | 123 +++++- pkg/shim/runsc/BUILD | 16 + pkg/shim/runsc/runsc.go | 49 ++- pkg/shim/v1/proc/BUILD | 35 ++ pkg/shim/v1/proc/deleted_state.go | 12 +- pkg/shim/v1/proc/exec.go | 25 +- pkg/shim/v1/proc/exec_state.go | 16 +- pkg/shim/v1/proc/init.go | 80 ++-- pkg/shim/v1/proc/init_state.go | 18 +- pkg/shim/v1/proc/io.go | 5 +- pkg/shim/v1/proc/process.go | 6 +- pkg/shim/v1/proc/types.go | 10 +- pkg/shim/v1/proc/utils.go | 2 - pkg/shim/v1/shim/BUILD | 38 ++ pkg/shim/v1/shim/platform.go | 10 +- pkg/shim/v1/shim/service.go | 68 ++- pkg/shim/v1/utils/BUILD | 26 ++ pkg/shim/v1/utils/volumes.go | 41 +- pkg/shim/v2/BUILD | 41 ++ pkg/shim/v2/epoll.go | 6 +- pkg/shim/v2/options/BUILD | 11 + pkg/shim/v2/service.go | 115 ++--- pkg/shim/v2/service_linux.go | 10 +- pkg/test/criutil/criutil.go | 66 ++- runsc/BUILD | 12 +- runsc/debian/postinst.sh | 9 +- shim/BUILD | 15 + shim/README.md | 21 +- shim/configure-containerd-shim-runsc-v1.md | 72 ---- shim/configure-gvisor-containerd-shim.md | 42 -- shim/runsc.toml | 6 + shim/runtime-handler-quickstart.md | 251 ----------- shim/runtime-handler-shim-v2-quickstart.md | 232 ---------- shim/untrusted-workload-quickstart.md | 212 --------- shim/v1/BUILD | 43 ++ shim/v1/README.md | 50 +++ shim/v1/config.go | 40 ++ shim/v1/main.go | 247 ++++++++++- shim/v2/BUILD | 32 ++ shim/v2/README.md | 90 ++++ shim/v2/config.go | 40 -- shim/v2/main.go | 298 +------------ shim/v2/runtime-handler-shim-v2-quickstart.md | 232 ++++++++++ test/root/BUILD | 5 +- test/root/crictl_test.go | 452 ++++++++++++------- test/shim/containerd-install.sh | 44 -- test/shim/crictl-install.sh | 17 - test/shim/run-container.sh | 30 -- test/shim/runsc-install.sh | 8 - test/shim/runtime-handler-shim-v2/install.sh | 21 - test/shim/runtime-handler-shim-v2/test.sh | 34 -- test/shim/runtime-handler-shim-v2/validate.sh | 7 - test/shim/runtime-handler/install.sh | 24 -- test/shim/runtime-handler/test.sh | 33 -- test/shim/runtime-handler/usage.sh | 30 -- test/shim/runtimeclass-install.sh | 33 -- test/shim/shim-install.sh | 28 -- test/shim/untrusted-workload/install.sh | 27 -- test/shim/untrusted-workload/test.sh | 33 -- test/shim/untrusted-workload/usage.sh | 33 -- test/shim/validate.sh | 17 - tools/bazel.mk | 2 +- tools/installers/BUILD | 10 +- tools/installers/containerd.sh | 114 +++++ tools/installers/head.sh | 8 +- tools/vm/ubuntu1604/25_docker.sh | 65 --- tools/vm/ubuntu1604/30_containerd.sh | 86 ---- tools/vm/ubuntu1604/30_docker.sh | 65 +++ website/BUILD | 3 + 75 files changed, 2503 insertions(+), 2197 deletions(-) create mode 100644 g3doc/user_guide/runtimeclass.md create mode 100644 pkg/shim/runsc/BUILD create mode 100644 pkg/shim/v1/proc/BUILD create mode 100644 pkg/shim/v1/shim/BUILD create mode 100644 pkg/shim/v1/utils/BUILD create mode 100644 pkg/shim/v2/BUILD create mode 100644 pkg/shim/v2/options/BUILD create mode 100644 shim/BUILD delete mode 100644 shim/configure-containerd-shim-runsc-v1.md delete mode 100644 shim/configure-gvisor-containerd-shim.md create mode 100644 shim/runsc.toml delete mode 100644 shim/runtime-handler-quickstart.md delete mode 100644 shim/runtime-handler-shim-v2-quickstart.md delete mode 100644 
shim/untrusted-workload-quickstart.md create mode 100644 shim/v1/BUILD create mode 100644 shim/v1/README.md create mode 100644 shim/v1/config.go create mode 100644 shim/v2/BUILD create mode 100644 shim/v2/README.md delete mode 100644 shim/v2/config.go create mode 100644 shim/v2/runtime-handler-shim-v2-quickstart.md delete mode 100755 test/shim/containerd-install.sh delete mode 100755 test/shim/crictl-install.sh delete mode 100755 test/shim/run-container.sh delete mode 100755 test/shim/runsc-install.sh delete mode 100755 test/shim/runtime-handler-shim-v2/install.sh delete mode 100755 test/shim/runtime-handler-shim-v2/test.sh delete mode 100755 test/shim/runtime-handler-shim-v2/validate.sh delete mode 100755 test/shim/runtime-handler/install.sh delete mode 100755 test/shim/runtime-handler/test.sh delete mode 100755 test/shim/runtime-handler/usage.sh delete mode 100755 test/shim/runtimeclass-install.sh delete mode 100755 test/shim/shim-install.sh delete mode 100755 test/shim/untrusted-workload/install.sh delete mode 100755 test/shim/untrusted-workload/test.sh delete mode 100755 test/shim/untrusted-workload/usage.sh delete mode 100755 test/shim/validate.sh create mode 100755 tools/installers/containerd.sh delete mode 100755 tools/vm/ubuntu1604/25_docker.sh delete mode 100755 tools/vm/ubuntu1604/30_containerd.sh create mode 100755 tools/vm/ubuntu1604/30_docker.sh (limited to 'g3doc') diff --git a/BUILD b/BUILD index 962d54821..5d0dbde4c 100644 --- a/BUILD +++ b/BUILD @@ -69,7 +69,10 @@ go_path( name = "gopath", mode = "link", deps = [ + # Main binary. "//runsc", + "//shim/v1:gvisor-containerd-shim", + "//shim/v2:containerd-shim-runsc-v1", # Packages that are not dependencies of //runsc. "//pkg/sentry/kernel/memevent", diff --git a/Makefile b/Makefile index 85818ebea..58648f5d2 100644 --- a/Makefile +++ b/Makefile @@ -121,6 +121,22 @@ tests: ## Runs all local ptrace system call tests. @$(MAKE) test OPTIONS="--test_tag_filters runsc_ptrace test/syscalls/..." .PHONY: tests +containerd-test-%: ## Runs a local containerd test. +containerd-test-%: load-basic_alpine load-basic_python load-basic_busybox load-basic_resolv load-basic_httpd +containerd-test-%: install-test-runtime + @CONTAINERD_VERSION=$* $(MAKE) sudo TARGETS="tools/installers:containerd" + @$(MAKE) sudo TARGETS="test/root:root_test" ARGS="-test.v" + +# Note that we can't run containerd-test-1.1.8 tests here. +# +# Containerd 1.1.8 should work, but because of a bug in loading images locally +# (https://github.com/kubernetes-sigs/cri-tools/issues/421), we are unable to +# actually drive the tests. The v1 API is tested exclusively through 1.2.13. +containerd-tests: ## Runs all supported containerd version tests. +containerd-tests: containerd-test-1.2.13 +containerd-tests: containerd-test-1.3.4 +containerd-tests: containerd-test-1.4.0-beta.0 + ## ## Website & documentation helpers. ## @@ -233,11 +249,14 @@ dev: ## Installs a set of local runtimes. Requires sudo. refresh: ## Refreshes the runtime binary (for development only). Must have called 'dev' or 'test-install' first. @mkdir -p "$(RUNTIME_DIR)" - @$(MAKE) copy TARGETS=runsc DESTINATION="$(RUNTIME_BIN)" && chmod 0755 "$(RUNTIME_BIN)" + @$(MAKE) copy TARGETS=runsc DESTINATION="$(RUNTIME_BIN)" + @$(MAKE) copy TARGETS=shim/v1:gvisor-containerd-shim DESTINATION="$(RUNTIME_DIR)" + @$(MAKE) copy TARGETS=shim/v2:containerd-shim-runsc-v1 DESTINATION="$(RUNTIME_DIR)" .PHONY: install -test-install: ## Installs the runtime for testing. Requires sudo. 
+install-test-runtime: ## Installs the runtime for testing. Requires sudo. @$(MAKE) refresh ARGS="--net-raw --TESTONLY-test-name-env=RUNSC_TEST_NAME --debug --strace --log-packets $(ARGS)" + @$(MAKE) configure RUNTIME=runsc @$(MAKE) configure @sudo systemctl restart docker .PHONY: install-test diff --git a/WORKSPACE b/WORKSPACE index 417ec6100..b3e97b0cc 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -4,11 +4,11 @@ load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository") # Bazel/starlark utilities. http_archive( name = "bazel_skylib", + sha256 = "97e70364e9249702246c0e9444bccdc4b847bed1eb03c5a3ece4f83dfe6abc44", urls = [ "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.0.2/bazel-skylib-1.0.2.tar.gz", "https://github.com/bazelbuild/bazel-skylib/releases/download/1.0.2/bazel-skylib-1.0.2.tar.gz", ], - sha256 = "97e70364e9249702246c0e9444bccdc4b847bed1eb03c5a3ece4f83dfe6abc44", ) load("@bazel_skylib//:workspace.bzl", "bazel_skylib_workspace") @@ -159,7 +159,38 @@ load("@com_github_grpc_grpc//bazel:grpc_extra_deps.bzl", "grpc_extra_deps") grpc_extra_deps() -# External repositories, in sorted order. +# System Call test dependencies. +http_archive( + name = "com_google_absl", + sha256 = "56775f1283a59e6274c28d99981a9717ff4e0b1161e9129fdb2fcf22531d8d93", + strip_prefix = "abseil-cpp-a0d1e098c2f99694fa399b175a7ccf920762030e", + urls = [ + "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/a0d1e098c2f99694fa399b175a7ccf920762030e.tar.gz", + "https://github.com/abseil/abseil-cpp/archive/a0d1e098c2f99694fa399b175a7ccf920762030e.tar.gz", + ], +) + +http_archive( + name = "com_google_googletest", + sha256 = "0a10bea96d8670e5eef948d79d824162b1577bb7889539e49ec786bfc3e48912", + strip_prefix = "googletest-565f1b848215b77c3732bca345fe76a0431d8b34", + urls = [ + "https://mirror.bazel.build/github.com/google/googletest/archive/565f1b848215b77c3732bca345fe76a0431d8b34.tar.gz", + "https://github.com/google/googletest/archive/565f1b848215b77c3732bca345fe76a0431d8b34.tar.gz", + ], +) + +http_archive( + name = "com_google_benchmark", + sha256 = "3c6a165b6ecc948967a1ead710d4a181d7b0fbcaa183ef7ea84604994966221a", + strip_prefix = "benchmark-1.5.0", + urls = [ + "https://mirror.bazel.build/github.com/google/benchmark/archive/v1.5.0.tar.gz", + "https://github.com/google/benchmark/archive/v1.5.0.tar.gz", + ], +) + +# Go repositories, in gazelle order. 
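# Each go_repository stanza below pins a single Go module by import path to an
# exact version and go.sum hash ("sum"), keeping the bazel build consistent
# with the go.mod dependency set.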
go_repository( name = "com_github_cenkalti_backoff", importpath = "github.com/cenkalti/backoff", @@ -177,15 +208,15 @@ go_repository( go_repository( name = "com_github_golang_mock", importpath = "github.com/golang/mock", - sum = "h1:qGJ6qTW+x6xX/my+8YUVl4WNpX9B7+/l2tRsHGZ7f2s=", - version = "v1.3.1", + sum = "h1:G5FRp8JnTd7RQH5kemVNlMeyXQAztQ3mOWV95KxsXH8=", + version = "v1.1.1", ) go_repository( name = "com_github_google_go-cmp", importpath = "github.com/google/go-cmp", - sum = "h1:+dTQ8DZQJz0Mb/HjFlkptS1FeQ4cWSnN941F8aEG4SQ=", - version = "v0.2.0", + sum = "h1:Xye71clBPdm5HgqGwUkwhbynsUJZhDbS20FvLhQ2izg=", + version = "v0.3.1", ) go_repository( @@ -232,8 +263,8 @@ go_repository( go_repository( name = "com_github_opencontainers_runtime-spec", importpath = "github.com/opencontainers/runtime-spec", - sum = "h1:d9F+LNYwMyi3BDN4GzZdaSiq4otb8duVEWyZjeUtOQI=", - version = "v0.1.2-0.20171211145439-b2d941ef6a78", + sum = "h1:UfAcuLBJB9Coz72x1hgl8O5RVzTdNiaglX6v2DM6FI0=", + version = "v1.0.2", ) go_repository( @@ -261,8 +292,8 @@ go_repository( name = "org_golang_google_grpc", build_file_proto_mode = "disable", importpath = "google.golang.org/grpc", - sum = "h1:zvIju4sqAGvwKspUQOhwnpcqSbzi7/H6QomNNjTL4sk=", - version = "v1.27.1", + sum = "h1:Mm8atZtkT+P6R43n/dqNDWkPPu5BwRVu/1rJnJCeZH8=", + version = "v1.12.0", ) go_repository( @@ -289,8 +320,8 @@ go_repository( go_repository( name = "org_golang_x_net", importpath = "golang.org/x/net", - sum = "h1:R/3boaszxrf1GEUWTVDzSKVwLmSJpwZ1yqXm8j0v2QI=", - version = "v0.0.0-20190620200207-3b0461eec859", + sum = "h1:FCqk7JXVeupwwnGVopQCC0a0xRK0Rj7SL5AyjjWo4pk=", + version = "v0.0.0-20170716174642-b3756b4b77d7", ) go_repository( @@ -303,8 +334,8 @@ go_repository( go_repository( name = "org_golang_x_text", importpath = "golang.org/x/text", - sum = "h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=", - version = "v0.3.0", + sum = "h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=", + version = "v0.3.2", ) go_repository( @@ -317,8 +348,8 @@ go_repository( go_repository( name = "org_golang_x_tools", importpath = "golang.org/x/tools", - sum = "h1:Uglradbb4KfUWaYasZhlsDsGRwHHvRsHoNAEONef0W8=", - version = "v0.0.0-20200131233409-575de47986ce", + sum = "h1:NIou6eNFigscvKJmsbyez16S2cIS6idossORlFtSt2E=", + version = "v0.0.0-20181030221726-6c7e314b6563", ) go_repository( @@ -352,8 +383,8 @@ go_repository( go_repository( name = "org_golang_x_oauth2", importpath = "golang.org/x/oauth2", - sum = "h1:pE8b58s1HRDMi8RDc79m0HISf9D4TzseP40cEA6IGfs=", - version = "v0.0.0-20191202225959-858c2ad4c8b6", + sum = "h1:vEDujvNQGv4jgYKudGeI/+DAX4Jffq6hpD55MmoEvKs=", + version = "v0.0.0-20180821212333-d2e6202438be", ) go_repository( @@ -497,12 +528,11 @@ go_repository( version = "v1.5.0", ) -# BigQuery Dependencies for Benchmarks go_repository( name = "com_google_cloud_go", importpath = "cloud.google.com/go", - sum = "h1:eoz/lYxKSL4CNAiaUJ0ZfD1J3bfMYbU5B3rwM1C1EIU=", - version = "v0.55.0", + sum = "h1:e0WKqKTd5BnrG8aKH3J3h+QvEIQtSUcf2n5UZ5ZgLtQ=", + version = "v0.26.0", ) go_repository( @@ -515,8 +545,8 @@ go_repository( go_repository( name = "io_opencensus_go", importpath = "go.opencensus.io", - sum = "h1:8sGtKOrtQqkN1bp2AtX+misvLIlOmsEsNd+9NIcPEm8=", - version = "v0.22.3", + sum = "h1:C9hSCOW830chIVkdja34wa6Ky+IzWllkUinR+BtRZd4=", + version = "v0.22.0", ) go_repository( @@ -526,34 +556,505 @@ go_repository( version = "v0.0.0-20200121045136-8c9f03a8e57e", ) -# System Call test dependencies. 
-http_archive( - name = "com_google_absl", - sha256 = "56775f1283a59e6274c28d99981a9717ff4e0b1161e9129fdb2fcf22531d8d93", - strip_prefix = "abseil-cpp-a0d1e098c2f99694fa399b175a7ccf920762030e", - urls = [ - "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/a0d1e098c2f99694fa399b175a7ccf920762030e.tar.gz", - "https://github.com/abseil/abseil-cpp/archive/a0d1e098c2f99694fa399b175a7ccf920762030e.tar.gz", - ], +go_repository( + name = "com_github_census_instrumentation_opencensus_proto", + importpath = "github.com/census-instrumentation/opencensus-proto", + sum = "h1:glEXhBS5PSLLv4IXzLA5yPRVX4bilULVyxxbrfOtDAk=", + version = "v0.2.1", ) -http_archive( - name = "com_google_googletest", - sha256 = "0a10bea96d8670e5eef948d79d824162b1577bb7889539e49ec786bfc3e48912", - strip_prefix = "googletest-565f1b848215b77c3732bca345fe76a0431d8b34", - urls = [ - "https://mirror.bazel.build/github.com/google/googletest/archive/565f1b848215b77c3732bca345fe76a0431d8b34.tar.gz", - "https://github.com/google/googletest/archive/565f1b848215b77c3732bca345fe76a0431d8b34.tar.gz", - ], +go_repository( + name = "com_github_client9_misspell", + importpath = "github.com/client9/misspell", + sum = "h1:ta993UF76GwbvJcIo3Y68y/M3WxlpEHPWIGDkJYwzJI=", + version = "v0.3.4", ) -http_archive( - name = "com_google_benchmark", - sha256 = "3c6a165b6ecc948967a1ead710d4a181d7b0fbcaa183ef7ea84604994966221a", - strip_prefix = "benchmark-1.5.0", - urls = [ - "https://mirror.bazel.build/github.com/google/benchmark/archive/v1.5.0.tar.gz", - "https://github.com/google/benchmark/archive/v1.5.0.tar.gz", - ], +go_repository( + name = "com_github_cncf_udpa_go", + importpath = "github.com/cncf/udpa/go", + sum = "h1:WBZRG4aNOuI15bLRrCgN8fCq8E5Xuty6jGbmSNEvSsU=", + version = "v0.0.0-20191209042840-269d4d468f6f", +) + +# K8s must be pinned to 1.14, prior to APIs being split into separate +# repositories. The containerd versions below assume the existence of the cri +# API within the kubelet package. 
+go_repository( + name = "io_k8s_kubernetes", + importpath = "k8s.io/kubernetes", + sum = "h1:6T2iAEoOYQnzQb3WvPlUkcczEEXZ7+YPlAO8olwujRw=", + version = "v1.14.0", +) + +go_repository( + name = "com_github_containerd_cgroups", + build_file_proto_mode = "disable", + importpath = "github.com/containerd/cgroups", + sum = "h1:5/YbYIVboEqSpFSC/iMO9FOIP/q8RIuKfYS2TA1M8WE=", + version = "v0.0.0-20200520162414-666f4a009ffb", +) + +go_repository( + name = "com_github_containerd_console", + build_file_proto_mode = "disable", + importpath = "github.com/containerd/console", + sum = "h1:uict5mhHFTzKLUCufdSLym7z/J0CbBJT59lYbP9wtbg=", + version = "v0.0.0-20180822173158-c12b1e7919c1", +) + +go_repository( + name = "com_github_containerd_containerd", + build_file_proto_mode = "disable", + importpath = "github.com/containerd/containerd", + sum = "h1:BmZa1bGjKctYrIbyjbhZJlGvHceJASpdW5pIDSQcw1E=", + version = "v0.0.0-20190510190154-d0319ec44af6", +) + +go_repository( + name = "com_github_containerd_continuity", + build_file_proto_mode = "disable", + importpath = "github.com/containerd/continuity", + sum = "h1:tN9D97v5A5QuKdcKHKt+UMKrkQ5YXUnD8iM7IAAjEfI=", + version = "v0.0.0-20190815185530-f2a389ac0a02", +) + +go_repository( + name = "com_github_containerd_cri", + build_file_proto_mode = "disable", + importpath = "github.com/containerd/cri", + sum = "h1:+bW7GQb2q32/Liy0ZiR6pkpRXdDHShUXRoWg8OGVWZs=", + version = "v0.0.0-20190308093238-8a0bd84b9a4c", +) + +go_repository( + name = "com_github_containerd_fifo", + build_file_proto_mode = "disable", + importpath = "github.com/containerd/fifo", + sum = "h1:XGyg7oTtD0DoRFhbpV6x1WfV0flKC4UxXU7ab1zC08U=", + version = "v0.0.0-20180307165137-3d5202aec260", +) + +go_repository( + name = "com_github_containerd_go_runc", + build_file_proto_mode = "disable", + importpath = "github.com/containerd/go-runc", + sum = "h1:esQOJREg8nw8aXj6uCN5dfW5cKUBiEJ/+nni1Q/D/sw=", + version = "v0.0.0-20180907222934-5a6d9f37cfa3", +) + +go_repository( + name = "com_github_containerd_ttrpc", + build_file_proto_mode = "disable", + importpath = "github.com/containerd/ttrpc", + sum = "h1:SKDlsIhYxNE1LO0xwuOR+3QWj3zRibVQu5jWIMQmOfU=", + version = "v0.0.0-20190411181408-699c4e40d1e7", +) + +go_repository( + name = "com_github_containerd_typeurl", + build_file_proto_mode = "disable", + importpath = "github.com/containerd/typeurl", + sum = "h1:JNn81o/xG+8NEo3bC/vx9pbi/g2WI8mtP2/nXzu297Y=", + version = "v0.0.0-20180627222232-a93fcdb778cd", +) + +go_repository( + name = "com_github_coreos_go_systemd", + importpath = "github.com/coreos/go-systemd", + sum = "h1:iW4rZ826su+pqaw19uhpSCzhj44qo35pNgKFGqzDKkU=", + version = "v0.0.0-20191104093116-d3cd4ed1dbcf", +) + +go_repository( + name = "com_github_davecgh_go_spew", + importpath = "github.com/davecgh/go-spew", + sum = "h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=", + version = "v1.1.1", +) + +go_repository( + name = "com_github_docker_distribution", + importpath = "github.com/docker/distribution", + sum = "h1:a5mlkVzth6W5A4fOsS3D2EO5BUmsJpcB+cRlLU7cSug=", + version = "v2.7.1+incompatible", +) + +go_repository( + name = "com_github_docker_go_events", + importpath = "github.com/docker/go-events", + sum = "h1:+pKlWGMw7gf6bQ+oDZB4KHQFypsfjYlq/C4rfL7D3g8=", + version = "v0.0.0-20190806004212-e31b211e4f1c", +) + +go_repository( + name = "com_github_docker_go_units", + importpath = "github.com/docker/go-units", + sum = "h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw=", + version = "v0.4.0", +) + +go_repository( + name = "com_github_dustin_go_humanize", + 
importpath = "github.com/dustin/go-humanize", + sum = "h1:qk/FSDDxo05wdJH28W+p5yivv7LuLYLRXPPD8KQCtZs=", + version = "v0.0.0-20171111073723-bb3d318650d4", +) + +go_repository( + name = "com_github_envoyproxy_go_control_plane", + importpath = "github.com/envoyproxy/go-control-plane", + sum = "h1:rEvIZUSZ3fx39WIi3JkQqQBitGwpELBIYWeBVh6wn+E=", + version = "v0.9.4", +) + +go_repository( + name = "com_github_envoyproxy_protoc_gen_validate", + importpath = "github.com/envoyproxy/protoc-gen-validate", + sum = "h1:EQciDnbrYxy13PgWoY8AqoxGiPrpgBZ1R8UNe3ddc+A=", + version = "v0.1.0", +) + +go_repository( + name = "com_github_fsnotify_fsnotify", + importpath = "github.com/fsnotify/fsnotify", + sum = "h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I=", + version = "v1.4.7", +) + +go_repository( + name = "com_github_godbus_dbus", + importpath = "github.com/godbus/dbus", + sum = "h1:ZpnhV/YsD2/4cESfV5+Hoeu/iUR3ruzNvZ+yQfO03a0=", + version = "v0.0.0-20190726142602-4481cbc300e2", +) + +go_repository( + name = "com_github_gogo_googleapis", + importpath = "github.com/gogo/googleapis", + sum = "h1:zgVt4UpGxcqVOw97aRGxT4svlcmdK35fynLNctY32zI=", + version = "v1.4.0", +) + +go_repository( + name = "com_github_gogo_protobuf", + importpath = "github.com/gogo/protobuf", + sum = "h1:DqDEcV5aeaTmdFBePNpYsp3FlcVH/2ISVVM9Qf8PSls=", + version = "v1.3.1", +) + +go_repository( + name = "com_github_golang_glog", + importpath = "github.com/golang/glog", + sum = "h1:VKtxabqXZkF25pY9ekfRL6a582T4P37/31XEstQ5p58=", + version = "v0.0.0-20160126235308-23def4e6c14b", +) + +go_repository( + name = "com_github_hashicorp_golang_lru", + importpath = "github.com/hashicorp/golang-lru", + sum = "h1:0hERBMJE1eitiLkihrMvRVBYAkpHzc/J3QdDN+dAcgU=", + version = "v0.5.1", +) + +go_repository( + name = "com_github_hpcloud_tail", + importpath = "github.com/hpcloud/tail", + sum = "h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI=", + version = "v1.0.0", +) + +go_repository( + name = "com_github_inconshreveable_mousetrap", + importpath = "github.com/inconshreveable/mousetrap", + sum = "h1:Z8tu5sraLXCXIcARxBp/8cbvlwVa7Z1NHg9XEKhtSvM=", + version = "v1.0.0", +) + +go_repository( + name = "com_github_kisielk_errcheck", + importpath = "github.com/kisielk/errcheck", + sum = "h1:reN85Pxc5larApoH1keMBiu2GWtPqXQ1nc9gx+jOU+E=", + version = "v1.2.0", +) + +go_repository( + name = "com_github_kisielk_gotool", + importpath = "github.com/kisielk/gotool", + sum = "h1:AV2c/EiW3KqPNT9ZKl07ehoAGi4C5/01Cfbblndcapg=", + version = "v1.0.0", +) + +go_repository( + name = "com_github_konsorten_go_windows_terminal_sequences", + importpath = "github.com/konsorten/go-windows-terminal-sequences", + sum = "h1:DB17ag19krx9CFsz4o3enTrPXyIXCl+2iCXH/aMAp9s=", + version = "v1.0.2", +) + +go_repository( + name = "com_github_microsoft_go_winio", + importpath = "github.com/Microsoft/go-winio", + sum = "h1:+hMXMk01us9KgxGb7ftKQt2Xpf5hH/yky+TDA+qxleU=", + version = "v0.4.14", +) + +go_repository( + name = "com_github_microsoft_hcsshim", + importpath = "github.com/Microsoft/hcsshim", + sum = "h1:ZfF0+zZeYdzMIVMZHKtDKJvLHj76XCuVae/jNkjj0IA=", + version = "v0.8.6", +) + +go_repository( + name = "com_github_onsi_ginkgo", + importpath = "github.com/onsi/ginkgo", + sum = "h1:q/mM8GF/n0shIN8SaAZ0V+jnLPzen6WIVZdiwrRlMlo=", + version = "v1.10.1", +) + +go_repository( + name = "com_github_onsi_gomega", + importpath = "github.com/onsi/gomega", + sum = "h1:XPnZz8VVBHjVsy1vzJmRwIcSwiUO+JFfrv/xGiigmME=", + version = "v1.7.0", +) + +go_repository( + name = "com_github_opencontainers_go_digest", + 
importpath = "github.com/opencontainers/go-digest", + sum = "h1:QhPf3A2AZW3tTGvHPg0TA+CR3oHbVLlXUhlghqISp1I=", + version = "v0.0.0-20180430190053-c9281466c8b2", +) + +go_repository( + name = "com_github_opencontainers_image_spec", + importpath = "github.com/opencontainers/image-spec", + sum = "h1:JMemWkRwHx4Zj+fVxWoMCFm/8sYGGrUVojFA6h/TRcI=", + version = "v1.0.1", +) + +go_repository( + name = "com_github_opencontainers_runc", + importpath = "github.com/opencontainers/runc", + sum = "h1:dDCFes8Hj1r/i5qnypONo5jdOme/8HWZC/aNDyhECt0=", + version = "v1.0.0-rc8", +) + +go_repository( + name = "com_github_pkg_errors", + importpath = "github.com/pkg/errors", + sum = "h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=", + version = "v0.9.1", +) + +go_repository( + name = "com_github_pmezard_go_difflib", + importpath = "github.com/pmezard/go-difflib", + sum = "h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=", + version = "v1.0.0", +) + +go_repository( + name = "com_github_prometheus_client_model", + importpath = "github.com/prometheus/client_model", + sum = "h1:gQz4mCbXsO+nc9n1hCxHcGA3Zx3Eo+UHZoInFGUIXNM=", + version = "v0.0.0-20190812154241-14fe0d1b01d4", +) + +go_repository( + name = "com_github_prometheus_procfs", + importpath = "github.com/prometheus/procfs", + sum = "h1:hhvfGDVThBnd4kYisSFmYuHYeUhglxcwag7FhVPH9zM=", + version = "v0.0.0-20180125133057-cb4147076ac7", +) + +go_repository( + name = "com_github_sirupsen_logrus", + importpath = "github.com/sirupsen/logrus", + sum = "h1:SPIRibHv4MatM3XXNO2BJeFLZwZ2LvZgfQ5+UNI2im4=", + version = "v1.4.2", +) + +go_repository( + name = "com_github_spf13_cobra", + importpath = "github.com/spf13/cobra", + sum = "h1:GQkkv3XSnxhAMjdq2wLfEnptEVr+2BNvmHizILHn+d4=", + version = "v0.0.2-0.20171109065643-2da4a54c5cee", +) + +go_repository( + name = "com_github_spf13_pflag", + importpath = "github.com/spf13/pflag", + sum = "h1:j8jxLbQ0+T1DFggy6XoGvyUnrJWPR/JybflPvu5rwS4=", + version = "v1.0.1-0.20171106142849-4c012f6dcd95", +) + +go_repository( + name = "com_github_stretchr_objx", + importpath = "github.com/stretchr/objx", + sum = "h1:2vfRuCMp5sSVIDSqO8oNnWJq7mPa6KVP3iPIwFBuy8A=", + version = "v0.1.1", +) + +go_repository( + name = "com_github_stretchr_testify", + importpath = "github.com/stretchr/testify", + sum = "h1:bSDNvY7ZPG5RlJ8otE/7V6gMiyenm9RtJ7IUVIAoJ1w=", + version = "v1.2.2", +) + +go_repository( + name = "com_github_urfave_cli", + importpath = "github.com/urfave/cli", + sum = "h1:gsqYFH8bb9ekPA12kRo0hfjngWQjkJPlN9R0N78BoUo=", + version = "v1.22.2", +) + +go_repository( + name = "in_gopkg_airbrake_gobrake_v2", + importpath = "gopkg.in/airbrake/gobrake.v2", + sum = "h1:7z2uVWwn7oVeeugY1DtlPAy5H+KYgB1KeKTnqjNatLo=", + version = "v2.0.9", +) + +go_repository( + name = "in_gopkg_fsnotify_v1", + importpath = "gopkg.in/fsnotify.v1", + sum = "h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4=", + version = "v1.4.7", +) + +go_repository( + name = "in_gopkg_gemnasium_logrus_airbrake_hook_v2", + importpath = "gopkg.in/gemnasium/logrus-airbrake-hook.v2", + sum = "h1:OAj3g0cR6Dx/R07QgQe8wkA9RNjB2u4i700xBkIT4e0=", + version = "v2.1.2", +) + +go_repository( + name = "in_gopkg_tomb_v1", + importpath = "gopkg.in/tomb.v1", + sum = "h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ=", + version = "v1.0.0-20141024135613-dd632973f1e7", +) + +go_repository( + name = "in_gopkg_yaml_v2", + importpath = "gopkg.in/yaml.v2", + sum = "h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=", + version = "v2.2.2", +) + +go_repository( + name = "org_bazil_fuse", + importpath = "bazil.org/fuse", + 
sum = "h1:SC+c6A1qTFstO9qmB86mPV2IpYme/2ZoEQ0hrP+wo+Q=", + version = "v0.0.0-20160811212531-371fbbdaa898", +) + +go_repository( + name = "org_golang_google_appengine", + importpath = "google.golang.org/appengine", + sum = "h1:/wp5JvzpHIxhs/dumFmF7BXTf3Z+dd4uXta4kVyO508=", + version = "v1.4.0", +) + +go_repository( + name = "org_golang_google_genproto", + importpath = "google.golang.org/genproto", + sum = "h1:wVJP1pATLVPNxCz4R2mTO6HUJgfGE0PmIu2E10RuhCw=", + version = "v0.0.0-20170523043604-d80a6e20e776", +) + +go_repository( + name = "org_golang_x_exp", + importpath = "golang.org/x/exp", + sum = "h1:c2HOrn5iMezYjSlGPncknSEr/8x5LELb/ilJbXi9DEA=", + version = "v0.0.0-20190121172915-509febef88a4", +) + +go_repository( + name = "org_golang_x_lint", + importpath = "golang.org/x/lint", + sum = "h1:XQyxROzUlZH+WIQwySDgnISgOivlhjIEwaQaJEJrrN0=", + version = "v0.0.0-20190313153728-d0100b6bd8b3", +) + +go_repository( + name = "tools_gotest", + importpath = "gotest.tools", + sum = "h1:VsBPFP1AI068pPrMxtb/S8Zkgf9xEmTLJjfM+P5UIEo=", + version = "v2.2.0+incompatible", +) + +go_repository( + name = "co_honnef_go_tools", + importpath = "honnef.co/go/tools", + sum = "h1:/hemPrYIhOhy8zYrNj+069zDB68us2sMGsfkFJO0iZs=", + version = "v0.0.0-20190523083050-ea95bdfd59fc", +) + +go_repository( + name = "com_github_burntsushi_toml", + importpath = "github.com/BurntSushi/toml", + sum = "h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ=", + version = "v0.3.1", +) + +go_repository( + name = "io_k8s_cri_api", + importpath = "k8s.io/cri-api", + sum = "h1:bykYbClh5Bnjo2EMjlYbYQ3ksxHjjLcbriKPm831hVk=", + version = "v0.18.2", +) + +go_repository( + name = "com_github_docker_docker", + importpath = "github.com/docker/docker", + sum = "h1:Bh3QS4GYuVi8QeNskrV3ivn8p0bupmk0PfY4xmVulo4=", + version = "v17.12.0-ce-rc1.0.20200514230353-811a247d06e8+incompatible", +) + +go_repository( + name = "com_github_cilium_ebpf", + importpath = "github.com/cilium/ebpf", + sum = "h1:i8+1fuPLjSgAYXUyBlHNhFwjcfAsP4ufiuH1+PWkyDU=", + version = "v0.0.0-20200110133405-4032b1d8aae3", +) + +go_repository( + name = "com_github_coreos_go_systemd_v22", + importpath = "github.com/coreos/go-systemd/v22", + sum = "h1:XJIw/+VlJ+87J+doOxznsAWIdmWuViOVhkQamW5YV28=", + version = "v22.0.0", +) + +go_repository( + name = "com_github_cpuguy83_go_md2man_v2", + importpath = "github.com/cpuguy83/go-md2man/v2", + sum = "h1:EoUDS0afbrsXAZ9YQ9jdu/mZ2sXgT1/2yyNng4PGlyM=", + version = "v2.0.0", +) + +go_repository( + name = "com_github_godbus_dbus_v5", + importpath = "github.com/godbus/dbus/v5", + sum = "h1:ZqHaoEF7TBzh4jzPmqVhE/5A1z9of6orkAe5uHoAeME=", + version = "v5.0.3", +) + +go_repository( + name = "com_github_russross_blackfriday_v2", + importpath = "github.com/russross/blackfriday/v2", + sum = "h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q=", + version = "v2.0.1", +) + +go_repository( + name = "com_github_shurcool_sanitized_anchor_name", + importpath = "github.com/shurcooL/sanitized_anchor_name", + sum = "h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo=", + version = "v1.0.0", ) diff --git a/g3doc/user_guide/BUILD b/g3doc/user_guide/BUILD index b69aee12c..355dd49b3 100644 --- a/g3doc/user_guide/BUILD +++ b/g3doc/user_guide/BUILD @@ -68,3 +68,12 @@ doc( permalink = "/docs/user_guide/platforms/", weight = "30", ) + +doc( + name = "runtimeclass", + src = "runtimeclass.md", + category = "User Guide", + permalink = "/docs/user_guide/runtimeclass/", + subcategory = "Advanced", + weight = "91", +) diff --git a/g3doc/user_guide/runtimeclass.md 
b/g3doc/user_guide/runtimeclass.md new file mode 100644 index 000000000..9f2d794c3 --- /dev/null +++ b/g3doc/user_guide/runtimeclass.md @@ -0,0 +1,44 @@ +# RuntimeClass + +First, follow the appropriate installation instructions for your version of +containerd. + +* For 1.1 or lower, use `gvisor-containerd-shim`. +* For 1.2 or higher, use `containerd-shim-runsc-v1`. + +# Set up the Kubernetes RuntimeClass + +Creating the RuntimeClass in kubernetes is simple once the runtime is available +for containerd: + +```shell +cat <: -// +// runOrError will run the provided command. +// +// If an error is encountered and neither Stdout or Stderr was set the error +// will be returned in the format of : . func (r *Runsc) runOrError(cmd *exec.Cmd) error { if cmd.Stdout != nil || cmd.Stderr != nil { ec, err := Monitor.Start(cmd) diff --git a/pkg/shim/v1/proc/BUILD b/pkg/shim/v1/proc/BUILD new file mode 100644 index 000000000..a59bf198d --- /dev/null +++ b/pkg/shim/v1/proc/BUILD @@ -0,0 +1,35 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "proc", + srcs = [ + "deleted_state.go", + "exec.go", + "exec_state.go", + "init.go", + "init_state.go", + "io.go", + "process.go", + "types.go", + "utils.go", + ], + visibility = [ + "//pkg/shim:__subpackages__", + "//shim:__subpackages__", + ], + deps = [ + "//pkg/shim/runsc", + "@com_github_containerd_console//:go_default_library", + "@com_github_containerd_containerd//errdefs:go_default_library", + "@com_github_containerd_containerd//log:go_default_library", + "@com_github_containerd_containerd//mount:go_default_library", + "@com_github_containerd_containerd//runtime/proc:go_default_library", + "@com_github_containerd_fifo//:go_default_library", + "@com_github_containerd_go_runc//:go_default_library", + "@com_github_gogo_protobuf//types:go_default_library", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/pkg/shim/v1/proc/deleted_state.go b/pkg/shim/v1/proc/deleted_state.go index 0196c96dd..52aad40ac 100644 --- a/pkg/shim/v1/proc/deleted_state.go +++ b/pkg/shim/v1/proc/deleted_state.go @@ -17,33 +17,33 @@ package proc import ( "context" + "fmt" "github.com/containerd/console" "github.com/containerd/containerd/errdefs" "github.com/containerd/containerd/runtime/proc" - "github.com/pkg/errors" ) type deletedState struct{} func (*deletedState) Resize(ws console.WinSize) error { - return errors.Errorf("cannot resize a deleted process") + return fmt.Errorf("cannot resize a deleted process") } func (*deletedState) Start(ctx context.Context) error { - return errors.Errorf("cannot start a deleted process") + return fmt.Errorf("cannot start a deleted process") } func (*deletedState) Delete(ctx context.Context) error { - return errors.Wrap(errdefs.ErrNotFound, "cannot delete a deleted process") + return fmt.Errorf("cannot delete a deleted process: %w", errdefs.ErrNotFound) } func (*deletedState) Kill(ctx context.Context, sig uint32, all bool) error { - return errors.Wrap(errdefs.ErrNotFound, "cannot kill a deleted process") + return fmt.Errorf("cannot kill a deleted process: %w", errdefs.ErrNotFound) } func (*deletedState) SetExited(status int) {} func (*deletedState) Exec(ctx context.Context, path string, r *ExecConfig) (proc.Process, error) { - return nil, errors.Errorf("cannot exec in a deleted state") + return nil, fmt.Errorf("cannot exec in a deleted state") } diff --git a/pkg/shim/v1/proc/exec.go b/pkg/shim/v1/proc/exec.go 
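A minimal sketch of the manifest the kubectl heredoc in this quick start creates, assuming the node.k8s.io/v1beta1 RuntimeClass API of this era and the `runsc` handler name these docs use throughout:

```shell
cat <<EOF | kubectl apply -f -
apiVersion: node.k8s.io/v1beta1
kind: RuntimeClass
metadata:
  name: gvisor
handler: runsc
EOF
```

Pods then opt in by setting `runtimeClassName: gvisor` in their spec.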
index 6821e09cd..4ef9cf6cf 100644 --- a/pkg/shim/v1/proc/exec.go +++ b/pkg/shim/v1/proc/exec.go @@ -31,7 +31,6 @@ import ( "github.com/containerd/fifo" runc "github.com/containerd/go-runc" specs "github.com/opencontainers/runtime-spec/specs-go" - "github.com/pkg/errors" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/shim/runsc" @@ -151,9 +150,11 @@ func (e *execProcess) kill(ctx context.Context, sig uint32, _ bool) error { if err := e.parent.runtime.Kill(ctx, e.parent.id, int(sig), &runsc.KillOpts{ Pid: internalPid, }); err != nil { - // If this returns error, consider the process has already stopped. + // If this returns error, consider the process has + // already stopped. + // // TODO: Fix after signal handling is fixed. - return errors.Wrapf(errdefs.ErrNotFound, err.Error()) + return fmt.Errorf("%s: %w", err.Error(), errdefs.ErrNotFound) } } return nil @@ -182,16 +183,16 @@ func (e *execProcess) start(ctx context.Context) (err error) { ) if e.stdio.Terminal { if socket, err = runc.NewTempConsoleSocket(); err != nil { - return errors.Wrap(err, "failed to create runc console socket") + return fmt.Errorf("failed to create runc console socket: %w", err) } defer socket.Close() } else if e.stdio.IsNull() { if e.io, err = runc.NewNullIO(); err != nil { - return errors.Wrap(err, "creating new NULL IO") + return fmt.Errorf("creating new NULL IO: %w", err) } } else { if e.io, err = runc.NewPipeIO(e.parent.IoUID, e.parent.IoGID, withConditionalIO(e.stdio)); err != nil { - return errors.Wrap(err, "failed to create runc io pipes") + return fmt.Errorf("failed to create runc io pipes: %w", err) } } opts := &runsc.ExecOpts{ @@ -217,7 +218,7 @@ func (e *execProcess) start(ctx context.Context) (err error) { if e.stdio.Stdin != "" { sc, err := fifo.OpenFifo(context.Background(), e.stdio.Stdin, syscall.O_WRONLY|syscall.O_NONBLOCK, 0) if err != nil { - return errors.Wrapf(err, "failed to open stdin fifo %s", e.stdio.Stdin) + return fmt.Errorf("failed to open stdin fifo %s: %w", e.stdio.Stdin, err) } e.closers = append(e.closers, sc) e.stdin = sc @@ -228,25 +229,25 @@ func (e *execProcess) start(ctx context.Context) (err error) { if socket != nil { console, err := socket.ReceiveMaster() if err != nil { - return errors.Wrap(err, "failed to retrieve console master") + return fmt.Errorf("failed to retrieve console master: %w", err) } if e.console, err = e.parent.Platform.CopyConsole(ctx, console, e.stdio.Stdin, e.stdio.Stdout, e.stdio.Stderr, &e.wg, ©WaitGroup); err != nil { - return errors.Wrap(err, "failed to start console copy") + return fmt.Errorf("failed to start console copy: %w", err) } } else if !e.stdio.IsNull() { if err := copyPipes(ctx, e.io, e.stdio.Stdin, e.stdio.Stdout, e.stdio.Stderr, &e.wg, ©WaitGroup); err != nil { - return errors.Wrap(err, "failed to start io pipe copy") + return fmt.Errorf("failed to start io pipe copy: %w", err) } } copyWaitGroup.Wait() pid, err := runc.ReadPidFile(opts.PidFile) if err != nil { - return errors.Wrap(err, "failed to retrieve OCI runtime exec pid") + return fmt.Errorf("failed to retrieve OCI runtime exec pid: %w", err) } e.pid = pid internalPid, err := runc.ReadPidFile(opts.InternalPidFile) if err != nil { - return errors.Wrap(err, "failed to retrieve OCI runtime exec internal pid") + return fmt.Errorf("failed to retrieve OCI runtime exec internal pid: %w", err) } e.internalPid = internalPid go func() { diff --git a/pkg/shim/v1/proc/exec_state.go b/pkg/shim/v1/proc/exec_state.go index 5416cb601..4dcda8b44 100644 --- a/pkg/shim/v1/proc/exec_state.go +++ 
b/pkg/shim/v1/proc/exec_state.go @@ -17,9 +17,9 @@ package proc import ( "context" + "fmt" "github.com/containerd/console" - "github.com/pkg/errors" ) type execState interface { @@ -43,7 +43,7 @@ func (s *execCreatedState) transition(name string) error { case "deleted": s.p.execState = &deletedState{} default: - return errors.Errorf("invalid state transition %q to %q", stateName(s), name) + return fmt.Errorf("invalid state transition %q to %q", stateName(s), name) } return nil } @@ -87,7 +87,7 @@ func (s *execRunningState) transition(name string) error { case "stopped": s.p.execState = &execStoppedState{p: s.p} default: - return errors.Errorf("invalid state transition %q to %q", stateName(s), name) + return fmt.Errorf("invalid state transition %q to %q", stateName(s), name) } return nil } @@ -97,11 +97,11 @@ func (s *execRunningState) Resize(ws console.WinSize) error { } func (s *execRunningState) Start(ctx context.Context) error { - return errors.Errorf("cannot start a running process") + return fmt.Errorf("cannot start a running process") } func (s *execRunningState) Delete(ctx context.Context) error { - return errors.Errorf("cannot delete a running process") + return fmt.Errorf("cannot delete a running process") } func (s *execRunningState) Kill(ctx context.Context, sig uint32, all bool) error { @@ -125,17 +125,17 @@ func (s *execStoppedState) transition(name string) error { case "deleted": s.p.execState = &deletedState{} default: - return errors.Errorf("invalid state transition %q to %q", stateName(s), name) + return fmt.Errorf("invalid state transition %q to %q", stateName(s), name) } return nil } func (s *execStoppedState) Resize(ws console.WinSize) error { - return errors.Errorf("cannot resize a stopped container") + return fmt.Errorf("cannot resize a stopped container") } func (s *execStoppedState) Start(ctx context.Context) error { - return errors.Errorf("cannot start a stopped process") + return fmt.Errorf("cannot start a stopped process") } func (s *execStoppedState) Delete(ctx context.Context) error { diff --git a/pkg/shim/v1/proc/init.go b/pkg/shim/v1/proc/init.go index 0771540a9..b429cb94f 100644 --- a/pkg/shim/v1/proc/init.go +++ b/pkg/shim/v1/proc/init.go @@ -18,6 +18,7 @@ package proc import ( "context" "encoding/json" + "fmt" "io" "path/filepath" "strings" @@ -33,23 +34,22 @@ import ( "github.com/containerd/fifo" runc "github.com/containerd/go-runc" specs "github.com/opencontainers/runtime-spec/specs-go" - "github.com/pkg/errors" "gvisor.dev/gvisor/pkg/shim/runsc" ) -// InitPidFile name of the file that contains the init pid +// InitPidFile name of the file that contains the init pid. const InitPidFile = "init.pid" -// Init represents an initial process for a container +// Init represents an initial process for a container. type Init struct { wg sync.WaitGroup initState initState // mu is used to ensure that `Start()` and `Exited()` calls return in - // the right order when invoked in separate go routines. - // This is the case within the shim implementation as it makes use of - // the reaper interface. + // the right order when invoked in separate go routines. This is the + // case within the shim implementation as it makes use of the reaper + // interface. mu sync.Mutex waitBlock chan struct{} @@ -76,7 +76,7 @@ type Init struct { Monitor ProcessMonitor } -// NewRunsc returns a new runsc instance for a process +// NewRunsc returns a new runsc instance for a process. 
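The exec and init state types in these hunks form a small state machine; each state permits only certain transitions and rejects the rest with the "invalid state transition" error. A self-contained sketch of the pattern (the map form is illustrative; the real code switches per state type):

```go
package main

import "fmt"

// validTransitions mirrors the transitions the created/running/stopped
// states above accept (created may also go straight to stopped or deleted).
var validTransitions = map[string][]string{
	"created": {"running", "stopped", "deleted"},
	"running": {"stopped"},
	"stopped": {"deleted"},
}

// checkTransition reports whether a state change is legal, using the
// same error text as the transition methods above.
func checkTransition(from, to string) error {
	for _, next := range validTransitions[from] {
		if next == to {
			return nil
		}
	}
	return fmt.Errorf("invalid state transition %q to %q", from, to)
}

func main() {
	fmt.Println(checkTransition("created", "running")) // <nil>
	fmt.Println(checkTransition("stopped", "running")) // invalid state transition
}
```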
func NewRunsc(root, path, namespace, runtime string, config map[string]string) *runsc.Runsc { if root == "" { root = RunscRoot @@ -91,7 +91,7 @@ func NewRunsc(root, path, namespace, runtime string, config map[string]string) * } } -// New returns a new init process +// New returns a new init process. func New(id string, runtime *runsc.Runsc, stdio proc.Stdio) *Init { p := &Init{ id: id, @@ -104,21 +104,21 @@ func New(id string, runtime *runsc.Runsc, stdio proc.Stdio) *Init { return p } -// Create the process with the provided config +// Create the process with the provided config. func (p *Init) Create(ctx context.Context, r *CreateConfig) (err error) { var socket *runc.Socket if r.Terminal { if socket, err = runc.NewTempConsoleSocket(); err != nil { - return errors.Wrap(err, "failed to create OCI runtime console socket") + return fmt.Errorf("failed to create OCI runtime console socket: %w", err) } defer socket.Close() } else if hasNoIO(r) { if p.io, err = runc.NewNullIO(); err != nil { - return errors.Wrap(err, "creating new NULL IO") + return fmt.Errorf("creating new NULL IO: %w", err) } } else { if p.io, err = runc.NewPipeIO(p.IoUID, p.IoGID, withConditionalIO(p.stdio)); err != nil { - return errors.Wrap(err, "failed to create OCI runtime io pipes") + return fmt.Errorf("failed to create OCI runtime io pipes: %w", err) } } pidFile := filepath.Join(p.Bundle, InitPidFile) @@ -139,7 +139,7 @@ func (p *Init) Create(ctx context.Context, r *CreateConfig) (err error) { if r.Stdin != "" { sc, err := fifo.OpenFifo(context.Background(), r.Stdin, syscall.O_WRONLY|syscall.O_NONBLOCK, 0) if err != nil { - return errors.Wrapf(err, "failed to open stdin fifo %s", r.Stdin) + return fmt.Errorf("failed to open stdin fifo %s: %w", r.Stdin, err) } p.stdin = sc p.closers = append(p.closers, sc) @@ -150,58 +150,58 @@ func (p *Init) Create(ctx context.Context, r *CreateConfig) (err error) { if socket != nil { console, err := socket.ReceiveMaster() if err != nil { - return errors.Wrap(err, "failed to retrieve console master") + return fmt.Errorf("failed to retrieve console master: %w", err) } console, err = p.Platform.CopyConsole(ctx, console, r.Stdin, r.Stdout, r.Stderr, &p.wg, ©WaitGroup) if err != nil { - return errors.Wrap(err, "failed to start console copy") + return fmt.Errorf("failed to start console copy: %w", err) } p.console = console } else if !hasNoIO(r) { if err := copyPipes(ctx, p.io, r.Stdin, r.Stdout, r.Stderr, &p.wg, ©WaitGroup); err != nil { - return errors.Wrap(err, "failed to start io pipe copy") + return fmt.Errorf("failed to start io pipe copy: %w", err) } } copyWaitGroup.Wait() pid, err := runc.ReadPidFile(pidFile) if err != nil { - return errors.Wrap(err, "failed to retrieve OCI runtime container pid") + return fmt.Errorf("failed to retrieve OCI runtime container pid: %w", err) } p.pid = pid return nil } -// Wait for the process to exit +// Wait waits for the process to exit. func (p *Init) Wait() { <-p.waitBlock } -// ID of the process +// ID returns the ID of the process. func (p *Init) ID() string { return p.id } -// Pid of the process +// Pid returns the PID of the process. func (p *Init) Pid() int { return p.pid } -// ExitStatus of the process +// ExitStatus returns the exit status of the process. func (p *Init) ExitStatus() int { p.mu.Lock() defer p.mu.Unlock() return p.status } -// ExitedAt at time when the process exited +// ExitedAt returns the time when the process exited. 
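Create above (like execProcess.start earlier) picks one of three stdio modes before invoking the runtime: a console socket for terminal processes, null I/O when no streams are wired, and uid/gid-owned pipes otherwise. A reduced sketch of that choice using go-runc's constructors; `withConditionalIO` from the original is omitted and the helper name here is hypothetical:

```go
package main

import (
	runc "github.com/containerd/go-runc"
)

// setupIO sketches the three-way choice in Init.Create: a temporary
// console socket when a terminal is requested, null IO when no
// streams are wired up, and pipes owned by uid/gid otherwise.
func setupIO(terminal, hasStdio bool, uid, gid int) (socket *runc.Socket, io runc.IO, err error) {
	switch {
	case terminal:
		// The console master is retrieved later via socket.ReceiveMaster().
		socket, err = runc.NewTempConsoleSocket()
	case !hasStdio:
		io, err = runc.NewNullIO()
	default:
		io, err = runc.NewPipeIO(uid, gid)
	}
	return socket, io, err
}

func main() {
	_, nullIO, err := setupIO(false, false, 0, 0)
	if err == nil {
		nullIO.Close()
	}
}
```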
func (p *Init) ExitedAt() time.Time { p.mu.Lock() defer p.mu.Unlock() return p.exited } -// Status of the process +// Status returns the status of the process. func (p *Init) Status(ctx context.Context) (string, error) { p.mu.Lock() defer p.mu.Unlock() @@ -215,7 +215,7 @@ func (p *Init) Status(ctx context.Context) (string, error) { return p.convertStatus(c.Status), nil } -// Start the init process +// Start starts the init process. func (p *Init) Start(ctx context.Context) error { p.mu.Lock() defer p.mu.Unlock() @@ -250,7 +250,7 @@ func (p *Init) start(ctx context.Context) error { return nil } -// SetExited of the init process with the next status +// SetExited sets the exit status of the init process. func (p *Init) SetExited(status int) { p.mu.Lock() defer p.mu.Unlock() @@ -265,7 +265,7 @@ func (p *Init) setExited(status int) { close(p.waitBlock) } -// Delete the init process +// Delete deletes the init process. func (p *Init) Delete(ctx context.Context) error { p.mu.Lock() defer p.mu.Unlock() @@ -298,13 +298,13 @@ func (p *Init) delete(ctx context.Context) error { if err2 := mount.UnmountAll(p.Rootfs, 0); err2 != nil { log.G(ctx).WithError(err2).Warn("failed to cleanup rootfs mount") if err == nil { - err = errors.Wrap(err2, "failed rootfs umount") + err = fmt.Errorf("failed rootfs umount: %w", err2) } } return err } -// Resize the init processes console +// Resize resizes the init process's console. func (p *Init) Resize(ws console.WinSize) error { p.mu.Lock() defer p.mu.Unlock() @@ -322,7 +322,7 @@ func (p *Init) resize(ws console.WinSize) error { return p.console.Resize(ws) } -// Kill the init process +// Kill kills the init process. func (p *Init) Kill(ctx context.Context, signal uint32, all bool) error { p.mu.Lock() defer p.mu.Unlock() @@ -340,7 +340,7 @@ func (p *Init) kill(context context.Context, signal uint32, all bool) error { c, err := p.runtime.State(context, p.id) if err != nil { if strings.Contains(err.Error(), "does not exist") { - return errors.Wrapf(errdefs.ErrNotFound, "no such process") + return fmt.Errorf("no such process: %w", errdefs.ErrNotFound) } return p.runtimeError(err, "OCI runtime state failed") } @@ -348,7 +348,7 @@ func (p *Init) kill(context context.Context, signal uint32, all bool) error { // If the container is not in running state, directly return // "no such process" if p.convertStatus(c.Status) == "stopped" { - return errors.Wrapf(errdefs.ErrNotFound, "no such process") + return fmt.Errorf("no such process: %w", errdefs.ErrNotFound) } killErr = p.runtime.Kill(context, p.id, int(signal), &runsc.KillOpts{ All: all, @@ -362,7 +362,7 @@ func (p *Init) kill(context context.Context, signal uint32, all bool) error { return p.runtimeError(killErr, "kill timeout") } -// KillAll processes belonging to the init process +// KillAll kills all processes belonging to the init process. func (p *Init) KillAll(context context.Context) error { p.mu.Lock() defer p.mu.Unlock() @@ -380,17 +380,17 @@ func (p *Init) killAll(context context.Context) error { return nil } -// Stdin of the process +// Stdin returns the stdin of the process. func (p *Init) Stdin() io.Closer { return p.stdin } -// Runtime returns the OCI runtime configured for the init process +// Runtime returns the OCI runtime configured for the init process. func (p *Init) Runtime() *runsc.Runsc { return p.runtime } -// Exec returns a new child process +// Exec returns a new child process.
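The mu/waitBlock pair above is the heart of the lifecycle: SetExited records the status under the lock and then closes waitBlock, so any number of Wait callers unblock exactly once, after the status is visible. A self-contained sketch of that pattern:

```go
package main

import (
	"fmt"
	"sync"
)

// process sketches the Init primitives above: a mutex orders
// Start/SetExited, and a closed channel broadcasts the exit.
type process struct {
	mu        sync.Mutex
	status    int
	waitBlock chan struct{}
}

func newProcess() *process {
	return &process{waitBlock: make(chan struct{})}
}

// SetExited mirrors Init.SetExited: record the status, then close
// the channel so every pending and future Wait() returns.
func (p *process) SetExited(status int) {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.status = status
	close(p.waitBlock)
}

// Wait mirrors Init.Wait: block until SetExited has run. Reading
// status afterwards is safe; close() establishes the happens-before.
func (p *process) Wait() int {
	<-p.waitBlock
	return p.status
}

func main() {
	p := newProcess()
	go p.SetExited(0)
	fmt.Println("exit status:", p.Wait())
}
```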
func (p *Init) Exec(ctx context.Context, path string, r *ExecConfig) (proc.Process, error) { p.mu.Lock() defer p.mu.Unlock() @@ -398,7 +398,7 @@ func (p *Init) Exec(ctx context.Context, path string, r *ExecConfig) (proc.Proce return p.initState.Exec(ctx, path, r) } -// exec returns a new exec'd process +// exec returns a new exec'd process. func (p *Init) exec(ctx context.Context, path string, r *ExecConfig) (proc.Process, error) { // process exec request var spec specs.Process @@ -424,7 +424,7 @@ func (p *Init) exec(ctx context.Context, path string, r *ExecConfig) (proc.Proce return e, nil } -// Stdio of the process +// Stdio returns the stdio of the process. func (p *Init) Stdio() proc.Stdio { return p.stdio } @@ -437,11 +437,11 @@ func (p *Init) runtimeError(rErr error, msg string) error { rMsg, err := getLastRuntimeError(p.runtime) switch { case err != nil: - return errors.Wrapf(rErr, "%s: %s (%s)", msg, "unable to retrieve OCI runtime error", err.Error()) + return fmt.Errorf("%s: %w (unable to retrieve OCI runtime error: %v)", msg, rErr, err) case rMsg == "": - return errors.Wrap(rErr, msg) + return fmt.Errorf("%s: %w", msg, rErr) default: - return errors.Errorf("%s: %s", msg, rMsg) + return fmt.Errorf("%s: %s", msg, rMsg) } } diff --git a/pkg/shim/v1/proc/init_state.go b/pkg/shim/v1/proc/init_state.go index 868646b6c..509f27762 100644 --- a/pkg/shim/v1/proc/init_state.go +++ b/pkg/shim/v1/proc/init_state.go @@ -17,11 +17,11 @@ package proc import ( "context" + "fmt" "github.com/containerd/console" "github.com/containerd/containerd/errdefs" "github.com/containerd/containerd/runtime/proc" - "github.com/pkg/errors" ) type initState interface { @@ -46,7 +46,7 @@ func (s *createdState) transition(name string) error { case "deleted": s.p.initState = &deletedState{} default: - return errors.Errorf("invalid state transition %q to %q", stateName(s), name) + return fmt.Errorf("invalid state transition %q to %q", stateName(s), name) } return nil } @@ -107,7 +107,7 @@ func (s *runningState) transition(name string) error { case "stopped": s.p.initState = &stoppedState{p: s.p} default: - return errors.Errorf("invalid state transition %q to %q", stateName(s), name) + return fmt.Errorf("invalid state transition %q to %q", stateName(s), name) } return nil } @@ -117,11 +117,11 @@ func (s *runningState) Resize(ws console.WinSize) error { } func (s *runningState) Start(ctx context.Context) error { - return errors.Errorf("cannot start a running process") + return fmt.Errorf("cannot start a running process") } func (s *runningState) Delete(ctx context.Context) error { - return errors.Errorf("cannot delete a running process") + return fmt.Errorf("cannot delete a running process") } func (s *runningState) Kill(ctx context.Context, sig uint32, all bool) error { @@ -149,17 +149,17 @@ func (s *stoppedState) transition(name string) error { case "deleted": s.p.initState = &deletedState{} default: - return errors.Errorf("invalid state transition %q to %q", stateName(s), name) + return fmt.Errorf("invalid state transition %q to %q", stateName(s), name) } return nil } func (s *stoppedState) Resize(ws console.WinSize) error { - return errors.Errorf("cannot resize a stopped container") + return fmt.Errorf("cannot resize a stopped container") } func (s *stoppedState) Start(ctx context.Context) error { - return errors.Errorf("cannot start a stopped process") + return fmt.Errorf("cannot start a stopped process") } func (s *stoppedState) Delete(ctx context.Context) error { @@ -178,5 +178,5 @@ func (s *stoppedState) 
SetExited(status int) { } func (s *stoppedState) Exec(ctx context.Context, path string, r *ExecConfig) (proc.Process, error) { - return nil, errors.Errorf("cannot exec in a stopped state") + return nil, fmt.Errorf("cannot exec in a stopped state") } diff --git a/pkg/shim/v1/proc/io.go b/pkg/shim/v1/proc/io.go index 2677b4e54..5313c7a50 100644 --- a/pkg/shim/v1/proc/io.go +++ b/pkg/shim/v1/proc/io.go @@ -150,8 +150,9 @@ func (c *countingWriteCloser) Close() error { return c.WriteCloser.Close() } -// isFifo checks if a file is a fifo -// if the file does not exist then it returns false +// isFifo checks if a file is a fifo. +// +// If the file does not exist then it returns false. func isFifo(path string) (bool, error) { stat, err := os.Stat(path) if err != nil { diff --git a/pkg/shim/v1/proc/process.go b/pkg/shim/v1/proc/process.go index 1bfa99f4c..d462c3eef 100644 --- a/pkg/shim/v1/proc/process.go +++ b/pkg/shim/v1/proc/process.go @@ -16,10 +16,10 @@ package proc import ( - "github.com/pkg/errors" + "fmt" ) -// RunscRoot is the path to the root runsc state directory +// RunscRoot is the path to the root runsc state directory. const RunscRoot = "/run/containerd/runsc" func stateName(v interface{}) string { @@ -33,5 +33,5 @@ func stateName(v interface{}) string { case *stoppedState: return "stopped" } - panic(errors.Errorf("invalid state %v", v)) + panic(fmt.Errorf("invalid state %v", v)) } diff --git a/pkg/shim/v1/proc/types.go b/pkg/shim/v1/proc/types.go index dcd43bcca..5c215de5f 100644 --- a/pkg/shim/v1/proc/types.go +++ b/pkg/shim/v1/proc/types.go @@ -23,7 +23,7 @@ import ( runc "github.com/containerd/go-runc" ) -// Mount holds filesystem mount configuration +// Mount holds filesystem mount configuration. type Mount struct { Type string Source string @@ -31,7 +31,7 @@ type Mount struct { Options []string } -// CreateConfig hold task creation configuration +// CreateConfig holds task creation configuration. type CreateConfig struct { ID string Bundle string @@ -44,7 +44,7 @@ type CreateConfig struct { Options *google_protobuf.Any } -// ExecConfig holds exec creation configuration +// ExecConfig holds exec creation configuration. type ExecConfig struct { ID string Terminal bool @@ -54,14 +54,14 @@ type ExecConfig struct { Spec *google_protobuf.Any } -// Exit is the type of exit events +// Exit is the type of exit events. type Exit struct { Timestamp time.Time ID string Status int } -// ProcessMonitor monitors process exit changes +// ProcessMonitor monitors process exit changes. type ProcessMonitor interface { // Subscribe to process exit changes Subscribe() chan runc.Exit diff --git a/pkg/shim/v1/proc/utils.go b/pkg/shim/v1/proc/utils.go index 6e7dbdad7..716de2f59 100644 --- a/pkg/shim/v1/proc/utils.go +++ b/pkg/shim/v1/proc/utils.go @@ -34,8 +34,6 @@ const ( // inside the sandbox. var ExitCh = make(chan Exit, bufferSize) -// TODO(random-liu): This can be a utility. - -// TODO(mlaventure): move to runc package?
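The isFifo body is truncated above right after the os.Stat call. Its likely completion, per the doc comment's contract (a missing file reports false with a nil error); the mode check is an assumption consistent with os.ModeNamedPipe:

```go
package main

import (
	"fmt"
	"os"
)

// isFifo checks if a file is a fifo.
//
// If the file does not exist then it returns false.
func isFifo(path string) (bool, error) {
	stat, err := os.Stat(path)
	if err != nil {
		if os.IsNotExist(err) {
			return false, nil
		}
		return false, err
	}
	// Named pipes carry the ModeNamedPipe bit in their file mode.
	return stat.Mode()&os.ModeNamedPipe == os.ModeNamedPipe, nil
}

func main() {
	ok, err := isFifo("/tmp/nonexistent-fifo")
	fmt.Println(ok, err) // false <nil>
}
```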
func getLastRuntimeError(r *runsc.Runsc) (string, error) { if r.Log == "" { diff --git a/pkg/shim/v1/shim/BUILD b/pkg/shim/v1/shim/BUILD new file mode 100644 index 000000000..02cffbb01 --- /dev/null +++ b/pkg/shim/v1/shim/BUILD @@ -0,0 +1,38 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "shim", + srcs = [ + "platform.go", + "service.go", + ], + visibility = [ + "//pkg/shim:__subpackages__", + "//shim:__subpackages__", + ], + deps = [ + "//pkg/shim/runsc", + "//pkg/shim/v1/proc", + "//pkg/shim/v1/utils", + "@com_github_containerd_console//:go_default_library", + "@com_github_containerd_containerd//api/events:go_default_library", + "@com_github_containerd_containerd//api/types/task:go_default_library", + "@com_github_containerd_containerd//errdefs:go_default_library", + "@com_github_containerd_containerd//events:go_default_library", + "@com_github_containerd_containerd//log:go_default_library", + "@com_github_containerd_containerd//mount:go_default_library", + "@com_github_containerd_containerd//namespaces:go_default_library", + "@com_github_containerd_containerd//runtime:go_default_library", + "@com_github_containerd_containerd//runtime/linux/runctypes:go_default_library", + "@com_github_containerd_containerd//runtime/proc:go_default_library", + "@com_github_containerd_containerd//runtime/v1/shim:go_default_library", + "@com_github_containerd_containerd//runtime/v1/shim/v1:go_default_library", + "@com_github_containerd_fifo//:go_default_library", + "@com_github_containerd_typeurl//:go_default_library", + "@com_github_gogo_protobuf//types:go_default_library", + "@org_golang_google_grpc//codes:go_default_library", + "@org_golang_google_grpc//status:go_default_library", + ], +) diff --git a/pkg/shim/v1/shim/platform.go b/pkg/shim/v1/shim/platform.go index 86252c3f5..f6795563c 100644 --- a/pkg/shim/v1/shim/platform.go +++ b/pkg/shim/v1/shim/platform.go @@ -17,13 +17,13 @@ package shim import ( "context" + "fmt" "io" "sync" "syscall" "github.com/containerd/console" "github.com/containerd/fifo" - "github.com/pkg/errors" ) type linuxPlatform struct { @@ -32,7 +32,7 @@ type linuxPlatform struct { func (p *linuxPlatform) CopyConsole(ctx context.Context, console console.Console, stdin, stdout, stderr string, wg, cwg *sync.WaitGroup) (console.Console, error) { if p.epoller == nil { - return nil, errors.New("uninitialized epoller") + return nil, fmt.Errorf("uninitialized epoller") } epollConsole, err := p.epoller.Add(console) @@ -79,11 +79,11 @@ func (p *linuxPlatform) CopyConsole(ctx context.Context, console console.Console func (p *linuxPlatform) ShutdownConsole(ctx context.Context, cons console.Console) error { if p.epoller == nil { - return errors.New("uninitialized epoller") + return fmt.Errorf("uninitialized epoller") } epollConsole, ok := cons.(*console.EpollConsole) if !ok { - return errors.Errorf("expected EpollConsole, got %#v", cons) + return fmt.Errorf("expected EpollConsole, got %#v", cons) } return epollConsole.Shutdown(p.epoller.CloseConsole) } @@ -100,7 +100,7 @@ func (s *Service) initPlatform() error { } epoller, err := console.NewEpoller() if err != nil { - return errors.Wrap(err, "failed to initialize epoller") + return fmt.Errorf("failed to initialize epoller: %w", err) } s.platform = &linuxPlatform{ epoller: epoller, diff --git a/pkg/shim/v1/shim/service.go b/pkg/shim/v1/shim/service.go index aac172801..7b0280ed2 100644 --- a/pkg/shim/v1/shim/service.go +++ b/pkg/shim/v1/shim/service.go @@ -37,8 +37,6 @@ import ( shimapi 
"github.com/containerd/containerd/runtime/v1/shim/v1" "github.com/containerd/typeurl" ptypes "github.com/gogo/protobuf/types" - "github.com/pkg/errors" - "github.com/sirupsen/logrus" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" @@ -57,7 +55,7 @@ var ( } ) -// Config contains shim specific configuration +// Config contains shim specific configuration. type Config struct { Path string Namespace string @@ -66,17 +64,12 @@ type Config struct { RunscConfig map[string]string } -// NewService returns a new shim service that can be used via GRPC +// NewService returns a new shim service that can be used via GRPC. func NewService(config Config, publisher events.Publisher) (*Service, error) { if config.Namespace == "" { return nil, fmt.Errorf("shim namespace cannot be empty") } ctx := namespaces.WithNamespace(context.Background(), config.Namespace) - ctx = log.WithLogger(ctx, logrus.WithFields(logrus.Fields{ - "namespace": config.Namespace, - "path": config.Path, - "pid": os.Getpid(), - })) s := &Service{ config: config, context: ctx, @@ -86,13 +79,13 @@ func NewService(config Config, publisher events.Publisher) (*Service, error) { } go s.processExits() if err := s.initPlatform(); err != nil { - return nil, errors.Wrap(err, "failed to initialized platform behavior") + return nil, fmt.Errorf("failed to initialized platform behavior: %w", err) } go s.forward(publisher) return s, nil } -// Service is the shim implementation of a remote shim over GRPC +// Service is the shim implementation of a remote shim over GRPC. type Service struct { mu sync.Mutex @@ -108,7 +101,7 @@ type Service struct { bundle string } -// Create a new initial process and container with the underlying OCI runtime +// Create creates a new initial process and container with the underlying OCI runtime. func (s *Service) Create(ctx context.Context, r *shimapi.CreateTaskRequest) (_ *shimapi.CreateTaskResponse, err error) { s.mu.Lock() defer s.mu.Unlock() @@ -153,7 +146,7 @@ func (s *Service) Create(ctx context.Context, r *shimapi.CreateTaskRequest) (_ * Options: rm.Options, } if err := m.Mount(rootfs); err != nil { - return nil, errors.Wrapf(err, "failed to mount rootfs component %v", m) + return nil, fmt.Errorf("failed to mount rootfs component %v: %w", m, err) } } process, err := newInit( @@ -169,7 +162,8 @@ func (s *Service) Create(ctx context.Context, r *shimapi.CreateTaskRequest) (_ * if err := process.Create(ctx, config); err != nil { return nil, errdefs.ToGRPC(err) } - // save the main task id and bundle to the shim for additional requests + // Save the main task id and bundle to the shim for additional + // requests. s.id = r.ID s.bundle = r.Bundle pid := process.Pid() @@ -179,7 +173,7 @@ func (s *Service) Create(ctx context.Context, r *shimapi.CreateTaskRequest) (_ * }, nil } -// Start a process +// Start starts a process. func (s *Service) Start(ctx context.Context, r *shimapi.StartRequest) (*shimapi.StartResponse, error) { p, err := s.getExecProcess(r.ID) if err != nil { @@ -194,7 +188,7 @@ func (s *Service) Start(ctx context.Context, r *shimapi.StartRequest) (*shimapi. }, nil } -// Delete the initial process and container +// Delete deletes the initial process and container. 
func (s *Service) Delete(ctx context.Context, r *ptypes.Empty) (*shimapi.DeleteResponse, error) { p, err := s.getInitProcess() if err != nil { @@ -214,7 +208,7 @@ func (s *Service) Delete(ctx context.Context, r *ptypes.Empty) (*shimapi.DeleteR }, nil } -// DeleteProcess deletes an exec'd process +// DeleteProcess deletes an exec'd process. func (s *Service) DeleteProcess(ctx context.Context, r *shimapi.DeleteProcessRequest) (*shimapi.DeleteResponse, error) { if r.ID == s.id { return nil, status.Errorf(codes.InvalidArgument, "cannot delete init process with DeleteProcess") @@ -236,7 +230,7 @@ func (s *Service) DeleteProcess(ctx context.Context, r *shimapi.DeleteProcessReq }, nil } -// Exec an additional process inside the container +// Exec spawns an additional process inside the container. func (s *Service) Exec(ctx context.Context, r *shimapi.ExecProcessRequest) (*ptypes.Empty, error) { s.mu.Lock() @@ -268,7 +262,7 @@ func (s *Service) Exec(ctx context.Context, r *shimapi.ExecProcessRequest) (*pty return empty, nil } -// ResizePty of a process +// ResizePty resizes the terminal of a process. func (s *Service) ResizePty(ctx context.Context, r *shimapi.ResizePtyRequest) (*ptypes.Empty, error) { if r.ID == "" { return nil, errdefs.ToGRPCf(errdefs.ErrInvalidArgument, "id not provided") @@ -287,7 +281,7 @@ func (s *Service) ResizePty(ctx context.Context, r *shimapi.ResizePtyRequest) (* return empty, nil } -// State returns runtime state information for a process +// State returns runtime state information for a process. func (s *Service) State(ctx context.Context, r *shimapi.StateRequest) (*shimapi.StateResponse, error) { p, err := s.getExecProcess(r.ID) if err != nil { @@ -321,17 +315,17 @@ func (s *Service) State(ctx context.Context, r *shimapi.StateRequest) (*shimapi. }, nil } -// Pause the container +// Pause pauses the container. func (s *Service) Pause(ctx context.Context, r *ptypes.Empty) (*ptypes.Empty, error) { return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented) } -// Resume the container +// Resume resumes the container. func (s *Service) Resume(ctx context.Context, r *ptypes.Empty) (*ptypes.Empty, error) { return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented) } -// Kill a process with the provided signal +// Kill kills a process with the provided signal. func (s *Service) Kill(ctx context.Context, r *shimapi.KillRequest) (*ptypes.Empty, error) { if r.ID == "" { p, err := s.getInitProcess() @@ -354,7 +348,7 @@ func (s *Service) Kill(ctx context.Context, r *shimapi.KillRequest) (*ptypes.Emp return empty, nil } -// ListPids returns all pids inside the container +// ListPids returns all pids inside the container. func (s *Service) ListPids(ctx context.Context, r *shimapi.ListPidsRequest) (*shimapi.ListPidsResponse, error) { pids, err := s.getContainerPids(ctx, r.ID) if err != nil { @@ -372,7 +366,7 @@ func (s *Service) ListPids(ctx context.Context, r *shimapi.ListPidsRequest) (*sh } a, err := typeurl.MarshalAny(d) if err != nil { - return nil, errors.Wrapf(err, "failed to marshal process %d info", pid) + return nil, fmt.Errorf("failed to marshal process %d info: %w", pid, err) } pInfo.Info = a break @@ -385,7 +379,7 @@ func (s *Service) ListPids(ctx context.Context, r *shimapi.ListPidsRequest) (*sh }, nil } -// CloseIO of a process +// CloseIO closes the I/O context of a process.
func (s *Service) CloseIO(ctx context.Context, r *shimapi.CloseIORequest) (*ptypes.Empty, error) { p, err := s.getExecProcess(r.ID) if err != nil { @@ -393,30 +387,30 @@ func (s *Service) CloseIO(ctx context.Context, r *shimapi.CloseIORequest) (*ptyp } if stdin := p.Stdin(); stdin != nil { if err := stdin.Close(); err != nil { - return nil, errors.Wrap(err, "close stdin") + return nil, fmt.Errorf("close stdin: %w", err) } } return empty, nil } -// Checkpoint the container +// Checkpoint checkpoints the container. func (s *Service) Checkpoint(ctx context.Context, r *shimapi.CheckpointTaskRequest) (*ptypes.Empty, error) { return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented) } -// ShimInfo returns shim information such as the shim's pid +// ShimInfo returns shim information such as the shim's pid. func (s *Service) ShimInfo(ctx context.Context, r *ptypes.Empty) (*shimapi.ShimInfoResponse, error) { return &shimapi.ShimInfoResponse{ ShimPid: uint32(os.Getpid()), }, nil } -// Update a running container +// Update updates a running container. func (s *Service) Update(ctx context.Context, r *shimapi.UpdateTaskRequest) (*ptypes.Empty, error) { return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented) } -// Wait for a process to exit +// Wait waits for a process to exit. func (s *Service) Wait(ctx context.Context, r *shimapi.WaitRequest) (*shimapi.WaitResponse, error) { p, err := s.getExecProcess(r.ID) if err != nil { @@ -451,7 +445,7 @@ func (s *Service) checkProcesses(e proc.Exit) { for _, p := range s.allProcesses() { if p.ID() == e.ID { if ip, ok := p.(*proc.Init); ok { - // Ensure all children are killed + // Ensure all children are killed. if err := ip.KillAll(s.context); err != nil { log.G(s.context).WithError(err).WithField("id", ip.ID()). Error("failed to kill init's children") @@ -495,7 +489,7 @@ func (s *Service) forward(publisher events.Publisher) { } } -// getInitProcess returns initial process +// getInitProcess returns the init process. func (s *Service) getInitProcess() (rproc.Process, error) { s.mu.Lock() defer s.mu.Unlock() @@ -506,7 +500,7 @@ func (s *Service) getInitProcess() (rproc.Process, error) { return p, nil } -// getExecProcess returns exec process +// getExecProcess returns the given exec process. 
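The exit pipeline in checkProcesses and forward above rests on the ProcessMonitor contract from types.go: the monitor hands the shim a channel of runc.Exit events, and checkProcesses reacts, killing an init's children before publishing the exit. A toy monitor showing just the contract (the channel capacity is illustrative):

```go
package main

import (
	"fmt"

	runc "github.com/containerd/go-runc"
)

// chanMonitor is a minimal stand-in for ProcessMonitor: Subscribe
// exposes the channel the shim's processExits goroutine drains.
type chanMonitor struct {
	exits chan runc.Exit
}

func (m *chanMonitor) Subscribe() chan runc.Exit { return m.exits }

func main() {
	m := &chanMonitor{exits: make(chan runc.Exit, 32)}

	// A reaper would send this after waiting on a child.
	m.exits <- runc.Exit{Pid: 4242, Status: 137}

	e := <-m.Subscribe()
	fmt.Printf("pid %d exited with status %d\n", e.Pid, e.Status)
}
```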
func (s *Service) getExecProcess(id string) (rproc.Process, error) { s.mu.Lock() defer s.mu.Unlock() @@ -534,7 +528,7 @@ func getTopic(ctx context.Context, e interface{}) string { case *eventstypes.TaskExecStarted: return runtime.TaskExecStartedEventTopic default: - logrus.Warnf("no topic for type %#v", e) + log.L.Printf("no topic for type %#v", e) } return runtime.TaskUnknownTopic } @@ -551,10 +545,10 @@ func newInit(ctx context.Context, path, workDir, runtimeRoot, namespace string, spec, err := utils.ReadSpec(r.Bundle) if err != nil { - return nil, errors.Wrap(err, "read oci spec") + return nil, fmt.Errorf("read oci spec: %w", err) } if err := utils.UpdateVolumeAnnotations(r.Bundle, spec); err != nil { - return nil, errors.Wrap(err, "update volume annotations") + return nil, fmt.Errorf("update volume annotations: %w", err) } runsc.FormatLogPath(r.ID, config) diff --git a/pkg/shim/v1/utils/BUILD b/pkg/shim/v1/utils/BUILD new file mode 100644 index 000000000..9045781e1 --- /dev/null +++ b/pkg/shim/v1/utils/BUILD @@ -0,0 +1,26 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "utils", + srcs = [ + "utils.go", + "volumes.go", + ], + visibility = [ + "//pkg/shim:__subpackages__", + "//shim:__subpackages__", + ], + deps = [ + "@com_github_containerd_cri//pkg/annotations:go_default_library", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + ], +) + +go_test( + name = "utils_test", + size = "small", + srcs = ["volumes_test.go"], + library = ":utils", +) diff --git a/pkg/shim/v1/utils/volumes.go b/pkg/shim/v1/utils/volumes.go index 7323e7245..e4e9bf9b1 100644 --- a/pkg/shim/v1/utils/volumes.go +++ b/pkg/shim/v1/utils/volumes.go @@ -23,8 +23,6 @@ import ( "github.com/containerd/cri/pkg/annotations" specs "github.com/opencontainers/runtime-spec/specs-go" - "github.com/pkg/errors" - "github.com/sirupsen/logrus" ) const volumeKeyPrefix = "dev.gvisor.spec.mount." @@ -32,13 +30,13 @@ const volumeKeyPrefix = "dev.gvisor.spec.mount." var kubeletPodsDir = "/var/lib/kubelet/pods" // volumeName gets volume name from volume annotation key, example: -// dev.gvisor.spec.mount.NAME.share +// dev.gvisor.spec.mount.NAME.share func volumeName(k string) string { return strings.SplitN(strings.TrimPrefix(k, volumeKeyPrefix), ".", 2)[0] } // volumeFieldName gets volume field name from volume annotation key, example: -// `type` is the field of dev.gvisor.spec.mount.NAME.type +// `type` is the field of dev.gvisor.spec.mount.NAME.type func volumeFieldName(k string) string { parts := strings.Split(strings.TrimPrefix(k, volumeKeyPrefix), ".") return parts[len(parts)-1] @@ -48,16 +46,16 @@ func volumeFieldName(k string) string { func podUID(s *specs.Spec) (string, error) { sandboxLogDir := s.Annotations[annotations.SandboxLogDir] if sandboxLogDir == "" { - return "", errors.New("no sandbox log path annotation") + return "", fmt.Errorf("no sandbox log path annotation") } fields := strings.Split(filepath.Base(sandboxLogDir), "_") switch len(fields) { - case 1: // This is the old CRI logging path + case 1: // This is the old CRI logging path. return fields[0], nil - case 3: // This is the new CRI logging path + case 3: // This is the new CRI logging path. return fields[2], nil } - return "", errors.Errorf("unexpected sandbox log path %q", sandboxLogDir) + return "", fmt.Errorf("unexpected sandbox log path %q", sandboxLogDir) } // isVolumeKey checks whether an annotation key is for volume. 
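The annotation scheme above encodes one volume per dev.gvisor.spec.mount.NAME.FIELD key. A standalone sketch of the two parsing helpers and what they yield (the volume name "logs" is illustrative):

```go
package main

import (
	"fmt"
	"strings"
)

const volumeKeyPrefix = "dev.gvisor.spec.mount."

// volumeName mirrors the helper above: everything between the prefix
// and the first dot is the volume name.
func volumeName(k string) string {
	return strings.SplitN(strings.TrimPrefix(k, volumeKeyPrefix), ".", 2)[0]
}

// volumeFieldName mirrors the helper above: the last dotted segment
// is the field (share, type, source, ...).
func volumeFieldName(k string) string {
	parts := strings.Split(strings.TrimPrefix(k, volumeKeyPrefix), ".")
	return parts[len(parts)-1]
}

func main() {
	k := "dev.gvisor.spec.mount.logs.share"
	fmt.Println(volumeName(k))      // logs
	fmt.Println(volumeFieldName(k)) // share
}
```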
@@ -79,7 +77,7 @@ func volumePath(volume, uid string) (string, error) { return "", err } if len(dirs) != 1 { - return "", errors.Errorf("unexpected matched volume list %v", dirs) + return "", fmt.Errorf("unexpected matched volume list %v", dirs) } return dirs[0], nil } @@ -103,7 +101,6 @@ func UpdateVolumeAnnotations(bundle string, s *specs.Spec) error { if err != nil { // Skip if we can't get pod UID, because this doesn't work // for containerd 1.1. - logrus.WithError(err).Error("Can't get pod uid") return nil } } @@ -117,22 +114,25 @@ func UpdateVolumeAnnotations(bundle string, s *specs.Spec) error { } volume := volumeName(k) if uid != "" { - // This is a sandbox + // This is a sandbox. path, err := volumePath(volume, uid) if err != nil { - return errors.Wrapf(err, "get volume path for %q", volume) + return fmt.Errorf("get volume path for %q: %w", volume, err) } s.Annotations[volumeSourceKey(volume)] = path updated = true } else { - // This is a container + // This is a container. for i := range s.Mounts { - // An error is returned for sandbox if source annotation - // is not successfully applied, so it is guaranteed that - // the source annotation for sandbox has already been - // successfully applied at this point. - // The volume name is unique inside a pod, so matching without - // podUID is fine here. + // An error is returned for sandbox if source + // annotation is not successfully applied, so + // it is guaranteed that the source annotation + // for sandbox has already been successfully + // applied at this point. + // + // The volume name is unique inside a pod, so + // matching without podUID is fine here. + // // TODO: Pass podUID down to shim for containers to do // more accurate matching. if yes, _ := isVolumePath(volume, s.Mounts[i].Source); yes { @@ -147,7 +147,7 @@ func UpdateVolumeAnnotations(bundle string, s *specs.Spec) error { if !updated { return nil } - // Update bundle + // Update bundle.
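For a sandbox, volumePath above resolves the annotation to a path under the kubelet pods directory; only the len(dirs) != 1 check is visible in this hunk, so the glob pattern below is reconstructed from kubeletPodsDir and should be read as an assumption:

```go
package main

import (
	"fmt"
	"path/filepath"
)

var kubeletPodsDir = "/var/lib/kubelet/pods"

// volumePathPattern sketches the glob volumePath presumably builds:
// the volume plugin directory (empty-dir, configmap, ...) is unknown
// up front, hence the wildcard segment.
func volumePathPattern(volume, uid string) string {
	return filepath.Join(kubeletPodsDir, uid, "volumes", "*", volume)
}

func main() {
	// Illustrative pod UID and volume name.
	fmt.Println(volumePathPattern("logs", "8a6f3d7a-29b1-44ef-8f43-b3fd7c5d6a8e"))
}
```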
b, err := json.Marshal(s) if err != nil { return err diff --git a/pkg/shim/v2/BUILD b/pkg/shim/v2/BUILD new file mode 100644 index 000000000..450f62979 --- /dev/null +++ b/pkg/shim/v2/BUILD @@ -0,0 +1,41 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "v2", + srcs = [ + "epoll.go", + "service.go", + "service_linux.go", + ], + visibility = ["//shim:__subpackages__"], + deps = [ + "//pkg/shim/runsc", + "//pkg/shim/v1/proc", + "//pkg/shim/v1/utils", + "//pkg/shim/v2/options", + "//runsc/specutils", + "@com_github_burntsushi_toml//:go_default_library", + "@com_github_containerd_cgroups//:go_default_library", + "@com_github_containerd_cgroups//stats/v1:go_default_library", + "@com_github_containerd_console//:go_default_library", + "@com_github_containerd_containerd//api/events:go_default_library", + "@com_github_containerd_containerd//api/types/task:go_default_library", + "@com_github_containerd_containerd//errdefs:go_default_library", + "@com_github_containerd_containerd//events:go_default_library", + "@com_github_containerd_containerd//log:go_default_library", + "@com_github_containerd_containerd//mount:go_default_library", + "@com_github_containerd_containerd//namespaces:go_default_library", + "@com_github_containerd_containerd//runtime:go_default_library", + "@com_github_containerd_containerd//runtime/linux/runctypes:go_default_library", + "@com_github_containerd_containerd//runtime/proc:go_default_library", + "@com_github_containerd_containerd//runtime/v2/shim:go_default_library", + "@com_github_containerd_containerd//runtime/v2/task:go_default_library", + "@com_github_containerd_cri//pkg/api/runtimeoptions/v1:go_default_library", + "@com_github_containerd_fifo//:go_default_library", + "@com_github_containerd_typeurl//:go_default_library", + "@com_github_gogo_protobuf//types:go_default_library", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/pkg/shim/v2/epoll.go b/pkg/shim/v2/epoll.go index 57a2c5452..45cc38c2a 100644 --- a/pkg/shim/v2/epoll.go +++ b/pkg/shim/v2/epoll.go @@ -19,13 +19,13 @@ package v2 import ( "context" "sync" "github.com/containerd/cgroups" eventstypes "github.com/containerd/containerd/api/events" "github.com/containerd/containerd/events" + "github.com/containerd/containerd/log" "github.com/containerd/containerd/runtime" - "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -71,7 +71,7 @@ func (e *epoller) run(ctx context.Context) { if err == unix.EINTR { continue } - logrus.WithError(err).Error("cgroups: epoll wait") + log.L.Printf("cgroups: epoll wait: %v", err) } for i := 0; i < n; i++ { e.process(ctx, uintptr(events[i].Fd)) } @@ -117,7 +117,7 @@ func (e *epoller) process(ctx context.Context, fd uintptr) { if err := e.publisher.Publish(ctx, runtime.TaskOOMEventTopic, &eventstypes.TaskOOM{ ContainerID: i.id, }); err != nil { - logrus.WithError(err).Error("publish OOM event") + log.L.Printf("publish OOM event: %v", err) } } diff --git a/pkg/shim/v2/options/BUILD b/pkg/shim/v2/options/BUILD new file mode 100644 index 000000000..ca212e874 --- /dev/null +++ b/pkg/shim/v2/options/BUILD @@ -0,0 +1,11 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "options", + srcs = [ + "options.go", + ], + visibility = ["//:sandbox"], +) diff --git a/pkg/shim/v2/service.go b/pkg/shim/v2/service.go index 0cff82a89..c67b1beba 100644 --- a/pkg/shim/v2/service.go +++ b/pkg/shim/v2/service.go @@ -16,6 +16,7 @@ package v2 import ( "context" + "fmt" "io/ioutil" "os" "os/exec" @@ -26,6 +27,7 @@ import (
"github.com/BurntSushi/toml" "github.com/containerd/cgroups" + metrics "github.com/containerd/cgroups/stats/v1" "github.com/containerd/console" eventstypes "github.com/containerd/containerd/api/events" "github.com/containerd/containerd/api/types/task" @@ -42,14 +44,13 @@ import ( runtimeoptions "github.com/containerd/cri/pkg/api/runtimeoptions/v1" "github.com/containerd/typeurl" ptypes "github.com/gogo/protobuf/types" - "github.com/pkg/errors" - "github.com/sirupsen/logrus" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/shim/runsc" "gvisor.dev/gvisor/pkg/shim/v1/proc" "gvisor.dev/gvisor/pkg/shim/v1/utils" "gvisor.dev/gvisor/pkg/shim/v2/options" + "gvisor.dev/gvisor/runsc/specutils" ) var ( @@ -68,7 +69,7 @@ var _ = (taskAPI.TaskService)(&service{}) // we assume that a config.toml should exist in the runtime root. const configFile = "config.toml" -// New returns a new shim service that can be used via GRPC +// New returns a new shim service that can be used via GRPC. func New(ctx context.Context, id string, publisher events.Publisher) (shim.Shim, error) { ep, err := newOOMEpoller(publisher) if err != nil { @@ -89,13 +90,13 @@ func New(ctx context.Context, id string, publisher events.Publisher) (shim.Shim, runsc.Monitor = shim.Default if err := s.initPlatform(); err != nil { cancel() - return nil, errors.Wrap(err, "failed to initialized platform behavior") + return nil, fmt.Errorf("failed to initialized platform behavior: %w", err) } go s.forward(publisher) return s, nil } -// service is the shim implementation of a remote shim over GRPC +// service is the shim implementation of a remote shim over GRPC. type service struct { mu sync.Mutex @@ -179,7 +180,7 @@ func (s *service) StartShim(ctx context.Context, id, containerdBinary, container return "", err } if err := shim.SetScore(cmd.Process.Pid); err != nil { - return "", errors.Wrap(err, "failed to set OOM Score on shim") + return "", fmt.Errorf("failed to set OOM Score on shim: %w", err) } return address, nil } @@ -201,10 +202,10 @@ func (s *service) Cleanup(ctx context.Context) (*taskAPI.DeleteResponse, error) if err := r.Delete(ctx, s.id, &runsc.DeleteOpts{ Force: true, }); err != nil { - logrus.WithError(err).Warn("failed to remove runc container") + log.L.Printf("failed to remove runc container: %v", err) } if err := mount.UnmountAll(filepath.Join(path, "rootfs"), 0); err != nil { - logrus.WithError(err).Warn("failed to cleanup rootfs mount") + log.L.Printf("failed to cleanup rootfs mount: %v", err) } return &taskAPI.DeleteResponse{ ExitedAt: time.Now(), @@ -224,14 +225,15 @@ func (s *service) writeRuntime(path, runtime string) error { return ioutil.WriteFile(filepath.Join(path, "runtime"), []byte(runtime), 0600) } -// Create a new initial process and container with the underlying OCI runtime +// Create creates a new initial process and container with the underlying OCI +// runtime. func (s *service) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) (_ *taskAPI.CreateTaskResponse, err error) { s.mu.Lock() defer s.mu.Unlock() ns, err := namespaces.NamespaceRequired(ctx) if err != nil { - return nil, errors.Wrap(err, "create namespace") + return nil, fmt.Errorf("create namespace: %w", err) } // Read from root for now. 
@@ -258,7 +260,7 @@ func (s *service) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) (_ * path = filepath.Join(root, configFile) if _, err := os.Stat(path); err != nil { if !os.IsNotExist(err) { - return nil, errors.Wrapf(err, "stat config file %q", path) + return nil, fmt.Errorf("stat config file %q: %w", path, err) } // A config file in runtime root is not required. path = "" @@ -268,15 +270,15 @@ func (s *service) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) (_ * break } if o.TypeUrl != options.OptionType { - return nil, errors.Errorf("unsupported runtimeoptions %q", o.TypeUrl) + return nil, fmt.Errorf("unsupported runtimeoptions %q", o.TypeUrl) } path = o.ConfigPath default: - return nil, errors.Errorf("unsupported option type %q", r.Options.TypeUrl) + return nil, fmt.Errorf("unsupported option type %q", r.Options.TypeUrl) } if path != "" { if _, err = toml.DecodeFile(path, &opts); err != nil { - return nil, errors.Wrapf(err, "decode config file %q", path) + return nil, fmt.Errorf("decode config file %q: %w", path, err) } } } @@ -312,8 +314,8 @@ func (s *service) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) (_ * } defer func() { if err != nil { - if err2 := mount.UnmountAll(rootfs, 0); err2 != nil { - logrus.WithError(err2).Warn("failed to cleanup rootfs mount") + if err := mount.UnmountAll(rootfs, 0); err != nil { + log.L.Printf("failed to cleanup rootfs mount: %v", err) } } }() @@ -324,7 +326,7 @@ func (s *service) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) (_ * Options: rm.Options, } if err := m.Mount(rootfs); err != nil { - return nil, errors.Wrapf(err, "failed to mount rootfs component %v", m) + return nil, fmt.Errorf("failed to mount rootfs component %v: %w", m, err) } } process, err := newInit( @@ -343,20 +345,21 @@ func (s *service) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) (_ * if err := process.Create(ctx, config); err != nil { return nil, errdefs.ToGRPC(err) } - // save the main task id and bundle to the shim for additional requests + // Save the main task id and bundle to the shim for additional + // requests. s.id = r.ID s.bundle = r.Bundle - // Set up OOM notification on the sandbox's cgroup. This is done on sandbox - // create since the sandbox process will be created here. + // Set up OOM notification on the sandbox's cgroup. This is done on + // sandbox create since the sandbox process will be created here. pid := process.Pid() if pid > 0 { cg, err := cgroups.Load(cgroups.V1, cgroups.PidPath(pid)) if err != nil { - return nil, errors.Wrapf(err, "loading cgroup for %d", pid) + return nil, fmt.Errorf("loading cgroup for %d: %w", pid, err) } if err := s.oomPoller.add(s.id, cg); err != nil { - return nil, errors.Wrapf(err, "add cg to OOM monitor") + return nil, fmt.Errorf("add cg to OOM monitor: %w", err) } } s.task = process @@ -367,7 +370,7 @@ func (s *service) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) (_ * } -// Start a process +// Start starts a process. func (s *service) Start(ctx context.Context, r *taskAPI.StartRequest) (*taskAPI.StartResponse, error) { p, err := s.getProcess(r.ExecID) if err != nil { @@ -383,7 +386,7 @@ func (s *service) Start(ctx context.Context, r *taskAPI.StartRequest) (*taskAPI. }, nil } -// Delete the initial process and container +// Delete deletes the initial process and container. 
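Per the comment above, OOM monitoring can only be wired up after Create, because the sandbox pid exists only then. The registration step reduced to its essentials; the add signature is inferred from the call site, so treat this as a sketch:

```go
package v2

import (
	"fmt"

	"github.com/containerd/cgroups"
)

// watchOOM sketches the post-Create step: load the sandbox's v1
// cgroup by pid and register it with the shim's OOM epoller (the
// epoller type from epoll.go above).
func watchOOM(poller *epoller, id string, pid int) error {
	cg, err := cgroups.Load(cgroups.V1, cgroups.PidPath(pid))
	if err != nil {
		return fmt.Errorf("loading cgroup for %d: %w", pid, err)
	}
	return poller.add(id, cg)
}
```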
func (s *service) Delete(ctx context.Context, r *taskAPI.DeleteRequest) (*taskAPI.DeleteResponse, error) { p, err := s.getProcess(r.ExecID) if err != nil { @@ -411,7 +414,7 @@ func (s *service) Delete(ctx context.Context, r *taskAPI.DeleteRequest) (*taskAP }, nil } -// Exec an additional process inside the container +// Exec spawns an additional process inside the container. func (s *service) Exec(ctx context.Context, r *taskAPI.ExecProcessRequest) (*ptypes.Empty, error) { s.mu.Lock() p := s.processes[r.ExecID] @@ -440,7 +443,7 @@ func (s *service) Exec(ctx context.Context, r *taskAPI.ExecProcessRequest) (*pty return empty, nil } -// ResizePty of a process +// ResizePty resizes the terminal of a process. func (s *service) ResizePty(ctx context.Context, r *taskAPI.ResizePtyRequest) (*ptypes.Empty, error) { p, err := s.getProcess(r.ExecID) if err != nil { @@ -456,7 +459,7 @@ func (s *service) ResizePty(ctx context.Context, r *taskAPI.ResizePtyRequest) (* return empty, nil } -// State returns runtime state information for a process +// State returns runtime state information for a process. func (s *service) State(ctx context.Context, r *taskAPI.StateRequest) (*taskAPI.StateResponse, error) { p, err := s.getProcess(r.ExecID) if err != nil { @@ -490,17 +493,17 @@ func (s *service) State(ctx context.Context, r *taskAPI.StateRequest) (*taskAPI. }, nil } -// Pause the container +// Pause the container. func (s *service) Pause(ctx context.Context, r *taskAPI.PauseRequest) (*ptypes.Empty, error) { return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented) } -// Resume the container +// Resume the container. func (s *service) Resume(ctx context.Context, r *taskAPI.ResumeRequest) (*ptypes.Empty, error) { return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented) } -// Kill a process with the provided signal +// Kill a process with the provided signal. func (s *service) Kill(ctx context.Context, r *taskAPI.KillRequest) (*ptypes.Empty, error) { p, err := s.getProcess(r.ExecID) if err != nil { @@ -515,7 +518,7 @@ func (s *service) Kill(ctx context.Context, r *taskAPI.KillRequest) (*ptypes.Emp return empty, nil } -// Pids returns all pids inside the container +// Pids returns all pids inside the container. func (s *service) Pids(ctx context.Context, r *taskAPI.PidsRequest) (*taskAPI.PidsResponse, error) { pids, err := s.getContainerPids(ctx, r.ID) if err != nil { @@ -533,7 +536,7 @@ func (s *service) Pids(ctx context.Context, r *taskAPI.PidsRequest) (*taskAPI.Pi } a, err := typeurl.MarshalAny(d) if err != nil { - return nil, errors.Wrapf(err, "failed to marshal process %d info", pid) + return nil, fmt.Errorf("failed to marshal process %d info: %w", pid, err) } pInfo.Info = a break @@ -546,7 +549,7 @@ func (s *service) Pids(ctx context.Context, r *taskAPI.PidsRequest) (*taskAPI.Pi }, nil } -// CloseIO of a process +// CloseIO closes the I/O context of a process. func (s *service) CloseIO(ctx context.Context, r *taskAPI.CloseIORequest) (*ptypes.Empty, error) { p, err := s.getProcess(r.ExecID) if err != nil { @@ -554,18 +557,18 @@ func (s *service) CloseIO(ctx context.Context, r *taskAPI.CloseIORequest) (*ptyp } if stdin := p.Stdin(); stdin != nil { if err := stdin.Close(); err != nil { - return nil, errors.Wrap(err, "close stdin") + return nil, fmt.Errorf("close stdin: %w", err) } } return empty, nil } -// Checkpoint the container +// Checkpoint checkpoints the container. 
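The Stats handler below repacks runsc's event data into metrics.Metrics, the same protobuf runc emits, precisely so clients can decode both runtimes uniformly. A sketch of the client-side decode (decodeStats is a hypothetical helper):

```go
package main

import (
	"fmt"

	metrics "github.com/containerd/cgroups/stats/v1"
	"github.com/containerd/typeurl"
	ptypes "github.com/gogo/protobuf/types"
)

// decodeStats unpacks the Any payload a StatsResponse carries back
// into the shared metrics type.
func decodeStats(a *ptypes.Any) (*metrics.Metrics, error) {
	v, err := typeurl.UnmarshalAny(a)
	if err != nil {
		return nil, fmt.Errorf("unmarshal stats: %w", err)
	}
	m, ok := v.(*metrics.Metrics)
	if !ok {
		return nil, fmt.Errorf("unexpected stats type %T", v)
	}
	return m, nil
}

func main() {
	// Round-trip an empty Metrics message to show the symmetry.
	a, err := typeurl.MarshalAny(&metrics.Metrics{})
	if err != nil {
		panic(err)
	}
	m, err := decodeStats(a)
	fmt.Println(m != nil, err)
}
```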
func (s *service) Checkpoint(ctx context.Context, r *taskAPI.CheckpointTaskRequest) (*ptypes.Empty, error) { return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented) } -// Connect returns shim information such as the shim's pid +// Connect returns shim information such as the shim's pid. func (s *service) Connect(ctx context.Context, r *taskAPI.ConnectRequest) (*taskAPI.ConnectResponse, error) { var pid int if s.task != nil { @@ -605,52 +608,52 @@ func (s *service) Stats(ctx context.Context, r *taskAPI.StatsRequest) (*taskAPI. // gvisor currently (as of 2020-03-03) only returns the total memory // usage and current PID value[0]. However, we copy the common fields here // so that future updates will propagate correct information. We're - // using the cgroups.Metrics structure so we're returning the same type + // using the metrics.Metrics structure so we're returning the same type // as runc. // // [0]: https://github.com/google/gvisor/blob/277a0d5a1fbe8272d4729c01ee4c6e374d047ebc/runsc/boot/events.go#L61-L81 - data, err := typeurl.MarshalAny(&cgroups.Metrics{ - CPU: &cgroups.CPUStat{ - Usage: &cgroups.CPUUsage{ + data, err := typeurl.MarshalAny(&metrics.Metrics{ + CPU: &metrics.CPUStat{ + Usage: &metrics.CPUUsage{ Total: stats.Cpu.Usage.Total, Kernel: stats.Cpu.Usage.Kernel, User: stats.Cpu.Usage.User, PerCPU: stats.Cpu.Usage.Percpu, }, - Throttling: &cgroups.Throttle{ + Throttling: &metrics.Throttle{ Periods: stats.Cpu.Throttling.Periods, ThrottledPeriods: stats.Cpu.Throttling.ThrottledPeriods, ThrottledTime: stats.Cpu.Throttling.ThrottledTime, }, }, - Memory: &cgroups.MemoryStat{ + Memory: &metrics.MemoryStat{ Cache: stats.Memory.Cache, - Usage: &cgroups.MemoryEntry{ + Usage: &metrics.MemoryEntry{ Limit: stats.Memory.Usage.Limit, Usage: stats.Memory.Usage.Usage, Max: stats.Memory.Usage.Max, Failcnt: stats.Memory.Usage.Failcnt, }, - Swap: &cgroups.MemoryEntry{ + Swap: &metrics.MemoryEntry{ Limit: stats.Memory.Swap.Limit, Usage: stats.Memory.Swap.Usage, Max: stats.Memory.Swap.Max, Failcnt: stats.Memory.Swap.Failcnt, }, - Kernel: &cgroups.MemoryEntry{ + Kernel: &metrics.MemoryEntry{ Limit: stats.Memory.Kernel.Limit, Usage: stats.Memory.Kernel.Usage, Max: stats.Memory.Kernel.Max, Failcnt: stats.Memory.Kernel.Failcnt, }, - KernelTCP: &cgroups.MemoryEntry{ + KernelTCP: &metrics.MemoryEntry{ Limit: stats.Memory.KernelTCP.Limit, Usage: stats.Memory.KernelTCP.Usage, Max: stats.Memory.KernelTCP.Max, Failcnt: stats.Memory.KernelTCP.Failcnt, }, }, - Pids: &cgroups.PidsStat{ + Pids: &metrics.PidsStat{ Current: stats.Pids.Current, Limit: stats.Pids.Limit, }, @@ -663,12 +666,12 @@ func (s *service) Stats(ctx context.Context, r *taskAPI.StatsRequest) (*taskAPI. }, nil } -// Update a running container +// Update updates a running container. func (s *service) Update(ctx context.Context, r *taskAPI.UpdateTaskRequest) (*ptypes.Empty, error) { return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented) } -// Wait for a process to exit +// Wait waits for a process to exit. func (s *service) Wait(ctx context.Context, r *taskAPI.WaitRequest) (*taskAPI.WaitResponse, error) { p, err := s.getProcess(r.ExecID) if err != nil { @@ -697,7 +700,7 @@ func (s *service) checkProcesses(e proc.Exit) { for _, p := range s.allProcesses() { if p.ID() == e.ID { if ip, ok := p.(*proc.Init); ok { - // Ensure all children are killed + // Ensure all children are killed. if err := ip.KillAll(s.context); err != nil { log.G(s.context).WithError(err).WithField("id", ip.ID()). 
Error("failed to kill init's children") @@ -733,7 +736,7 @@ func (s *service) getContainerPids(ctx context.Context, id string) ([]uint32, er p := s.task s.mu.Unlock() if p == nil { - return nil, errors.Wrapf(errdefs.ErrFailedPrecondition, "container must be created") + return nil, fmt.Errorf("container must be created: %w", errdefs.ErrFailedPrecondition) } ps, err := p.(*proc.Init).Runtime().Ps(ctx, id) if err != nil { @@ -752,7 +755,7 @@ func (s *service) forward(publisher events.Publisher) { err := publisher.Publish(ctx, getTopic(e), e) cancel() if err != nil { - logrus.WithError(err).Error("post event") + fmt.Errorf("post event: %w", err) } } } @@ -787,7 +790,7 @@ func getTopic(e interface{}) string { case *eventstypes.TaskExecStarted: return runtime.TaskExecStartedEventTopic default: - logrus.Warnf("no topic for type %#v", e) + log.L.Printf("no topic for type %#v", e) } return runtime.TaskUnknownTopic } @@ -795,10 +798,10 @@ func getTopic(e interface{}) string { func newInit(ctx context.Context, path, workDir, namespace string, platform rproc.Platform, r *proc.CreateConfig, options *options.Options, rootfs string) (*proc.Init, error) { spec, err := utils.ReadSpec(r.Bundle) if err != nil { - return nil, errors.Wrap(err, "read oci spec") + return nil, fmt.Errorf("read oci spec: %w", err) } if err := utils.UpdateVolumeAnnotations(r.Bundle, spec); err != nil { - return nil, errors.Wrap(err, "update volume annotations") + return nil, fmt.Errorf("update volume annotations: %w", err) } runsc.FormatLogPath(r.ID, options.RunscConfig) runtime := proc.NewRunsc(options.Root, path, namespace, options.BinaryName, options.RunscConfig) @@ -814,7 +817,7 @@ func newInit(ctx context.Context, path, workDir, namespace string, platform rpro p.WorkDir = workDir p.IoUID = int(options.IoUid) p.IoGID = int(options.IoGid) - p.Sandbox = utils.IsSandbox(spec) + p.Sandbox = specutils.SpecContainerType(spec) == specutils.ContainerTypeSandbox p.UserLog = utils.UserLogPath(spec) p.Monitor = shim.Default return p, nil diff --git a/pkg/shim/v2/service_linux.go b/pkg/shim/v2/service_linux.go index cd259cd44..257c58812 100644 --- a/pkg/shim/v2/service_linux.go +++ b/pkg/shim/v2/service_linux.go @@ -19,13 +19,13 @@ package v2 import ( "context" + "fmt" "io" "sync" "syscall" "github.com/containerd/console" "github.com/containerd/fifo" - "github.com/pkg/errors" ) type linuxPlatform struct { @@ -34,7 +34,7 @@ type linuxPlatform struct { func (p *linuxPlatform) CopyConsole(ctx context.Context, console console.Console, stdin, stdout, stderr string, wg, cwg *sync.WaitGroup) (console.Console, error) { if p.epoller == nil { - return nil, errors.New("uninitialized epoller") + return nil, fmt.Errorf("uninitialized epoller") } epollConsole, err := p.epoller.Add(console) @@ -81,11 +81,11 @@ func (p *linuxPlatform) CopyConsole(ctx context.Context, console console.Console func (p *linuxPlatform) ShutdownConsole(ctx context.Context, cons console.Console) error { if p.epoller == nil { - return errors.New("uninitialized epoller") + return fmt.Errorf("uninitialized epoller") } epollConsole, ok := cons.(*console.EpollConsole) if !ok { - return errors.Errorf("expected EpollConsole, got %#v", cons) + return fmt.Errorf("expected EpollConsole, got %#v", cons) } return epollConsole.Shutdown(p.epoller.CloseConsole) } @@ -102,7 +102,7 @@ func (s *service) initPlatform() error { } epoller, err := console.NewEpoller() if err != nil { - return errors.Wrap(err, "failed to initialize epoller") + return fmt.Errorf("failed to initialize epoller: %w", err) 
 	}
 	s.platform = &linuxPlatform{
 		epoller: epoller,
diff --git a/pkg/test/criutil/criutil.go b/pkg/test/criutil/criutil.go
index 8fed29ff5..4c63d669a 100644
--- a/pkg/test/criutil/criutil.go
+++ b/pkg/test/criutil/criutil.go
@@ -22,6 +22,9 @@ import (
 	"fmt"
 	"os"
 	"os/exec"
+	"path"
+	"regexp"
+	"strconv"
 	"strings"
 	"time"

@@ -33,28 +36,44 @@
 type Crictl struct {
 	logger   testutil.Logger
 	endpoint string
+	runpArgs []string
 	cleanup  []func()
 }

 // resolvePath attempts to find binary paths. It may set the path to invalid,
 // which will cause the execution to fail with a sensible error.
 func resolvePath(executable string) string {
+	runtime, err := dockerutil.RuntimePath()
+	if err == nil {
+		// Check first the directory of the runtime itself.
+		if dir := path.Dir(runtime); dir != "" && dir != "." {
+			guess := path.Join(dir, executable)
+			if fi, err := os.Stat(guess); err == nil && (fi.Mode()&0111) != 0 {
+				return guess
+			}
+		}
+	}
+
+	// Try to find it via the PATH.
 	guess, err := exec.LookPath(executable)
-	if err != nil {
-		guess = fmt.Sprintf("/usr/local/bin/%s", executable)
+	if err == nil {
+		return guess
 	}
-	return guess
+
+	// Return a default path.
+	return fmt.Sprintf("/usr/local/bin/%s", executable)
 }

 // NewCrictl returns a Crictl configured with a timeout and an endpoint over
 // which it will talk to containerd.
-func NewCrictl(logger testutil.Logger, endpoint string) *Crictl {
+func NewCrictl(logger testutil.Logger, endpoint string, runpArgs []string) *Crictl {
 	// Attempt to find the executable, but don't bother propagating the
 	// error at this point. The first command executed will return with a
 	// binary not found error.
 	return &Crictl{
 		logger:   logger,
 		endpoint: endpoint,
+		runpArgs: runpArgs,
 	}
 }

@@ -67,8 +86,8 @@ func (cc *Crictl) CleanUp() {
 	}
 }

 // RunPod creates a sandbox. It corresponds to `crictl runp`.
-func (cc *Crictl) RunPod(sbSpecFile string) (string, error) {
-	podID, err := cc.run("runp", sbSpecFile)
+func (cc *Crictl) RunPod(runtime, sbSpecFile string) (string, error) {
+	podID, err := cc.run("runp", "--runtime", runtime, sbSpecFile)
 	if err != nil {
 		return "", fmt.Errorf("runp failed: %v", err)
 	}
@@ -79,10 +98,39 @@
 // Create creates a container within a sandbox. It corresponds to `crictl
 // create`.
 func (cc *Crictl) Create(podID, contSpecFile, sbSpecFile string) (string, error) {
-	podID, err := cc.run("create", podID, contSpecFile, sbSpecFile)
+	// In version 1.16.0, crictl annoyingly started attempting to pull the
+	// container image, even if it was already available locally. We
+	// therefore need to parse the version and add an appropriate --no-pull
+	// argument, since the image has already been loaded locally.
+	out, err := cc.run("-v")
+	r := regexp.MustCompile("crictl version ([0-9]+)\\.([0-9]+)\\.([0-9]+)")
+	vs := r.FindStringSubmatch(out)
+	if len(vs) != 4 {
+		return "", fmt.Errorf("crictl -v had unexpected output: %s", out)
+	}
+	major, err := strconv.ParseUint(vs[1], 10, 64)
+	if err != nil {
+		return "", fmt.Errorf("crictl had invalid version: %v (%s)", err, out)
+	}
+	minor, err := strconv.ParseUint(vs[2], 10, 64)
 	if err != nil {
+		return "", fmt.Errorf("crictl had invalid version: %v (%s)", err, out)
+	}
+
+	args := []string{"create"}
+	if (major == 1 && minor >= 16) || major > 1 {
+		args = append(args, "--no-pull")
+	}
+	args = append(args, podID)
+	args = append(args, contSpecFile)
+	args = append(args, sbSpecFile)
+
+	podID, err = cc.run(args...)
+	if err != nil {
 		return "", fmt.Errorf("create failed: %v", err)
 	}
+	// Strip the trailing newline from crictl output.
 	return strings.TrimSpace(podID), nil
 }

@@ -260,7 +308,7 @@ func (cc *Crictl) StopContainer(contID string) error {

 // StartPodAndContainer starts a sandbox and container in that sandbox. It
 // returns the pod ID and container ID.
-func (cc *Crictl) StartPodAndContainer(image, sbSpec, contSpec string) (string, string, error) {
+func (cc *Crictl) StartPodAndContainer(runtime, image, sbSpec, contSpec string) (string, string, error) {
 	if err := cc.Import(image); err != nil {
 		return "", "", err
 	}
@@ -277,7 +325,7 @@
 	}
 	cc.cleanup = append(cc.cleanup, cleanup)

-	podID, err := cc.RunPod(sbSpecFile)
+	podID, err := cc.RunPod(runtime, sbSpecFile)
 	if err != nil {
 		return "", "", err
 	}
diff --git a/runsc/BUILD b/runsc/BUILD
index 757f6d44c..96f697a5f 100644
--- a/runsc/BUILD
+++ b/runsc/BUILD
@@ -62,18 +62,22 @@ go_binary(
 )

 pkg_tar(
-    name = "runsc-bin",
-    srcs = [":runsc"],
+    name = "debian-bin",
+    srcs = [
+        ":runsc",
+        "//shim/v1:gvisor-containerd-shim",
+        "//shim/v2:containerd-shim-runsc-v1",
+    ],
     mode = "0755",
     package_dir = "/usr/bin",
-    strip_prefix = "/runsc/linux_amd64_pure_stripped",
 )

 pkg_tar(
     name = "debian-data",
     extension = "tar.gz",
     deps = [
-        ":runsc-bin",
+        ":debian-bin",
+        "//shim:config",
     ],
 )
diff --git a/runsc/debian/postinst.sh b/runsc/debian/postinst.sh
index dc7aeee87..d1e28e17b 100755
--- a/runsc/debian/postinst.sh
+++ b/runsc/debian/postinst.sh
@@ -18,7 +18,14 @@ if [ "$1" != configure ]; then
   exit 0
 fi

+# Update docker configuration.
 if [ -f /etc/docker/daemon.json ]; then
   runsc install
-  systemctl restart docker || echo "unable to restart docker; you must do so manually." >&2
+  if systemctl status docker 2>/dev/null; then
+    systemctl restart docker || echo "unable to restart docker; you must do so manually." >&2
+  fi
 fi
+
+# For containerd-based installers, we don't automatically update the
+# configuration. If it uses a v2 shim, then it will find the package binaries
+# automatically when provided the appropriate annotation.
diff --git a/shim/BUILD b/shim/BUILD
new file mode 100644
index 000000000..e581618b2
--- /dev/null
+++ b/shim/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "pkg_tar")
+
+package(licenses = ["notice"])
+
+pkg_tar(
+    name = "config",
+    srcs = [
+        "runsc.toml",
+    ],
+    mode = "0644",
+    package_dir = "/etc/containerd",
+    visibility = [
+        "//runsc:__pkg__",
+    ],
+)
diff --git a/shim/README.md b/shim/README.md
index e446ec970..c6824ebdc 100644
--- a/shim/README.md
+++ b/shim/README.md
@@ -1,16 +1,11 @@
-# gvisor-containerd-shim
+# gVisor Containerd shims

-gvisor-containerd-shim is a containerd shim. It implements the containerd v1
-shim API. It can be used as a drop-in replacement for
-[containerd-shim][containerd-shim]
-(though containerd-shim must still be installed). It allows the use of both
-gVisor (runsc) and normal containers in the same containerd installation by
-deferring to the runc shim if the desired runtime engine is not runsc.
+There are various shims supported for different versions of
+[containerd][containerd].
-- [Untrusted Workload Quick Start (containerd >=1.1)](docs/untrusted-workload-quickstart.md)
-- [Runtime Handler/RuntimeClass Quick Start (containerd >=1.2)](docs/runtime-handler-quickstart.md)
-- [Runtime Handler/RuntimeClass Quick Start (shim v2) (containerd >=1.2)](docs/runtime-handler-shim-v2-quickstart.md)
-- [Configure containerd-shim-runsc-v1 (shim v2) (containerd >= 1.3)](docs/configure-containerd-shim-runsc-v1.md)
-- [Configure gvisor-containerd-shim (shim v1) (containerd <= 1.2)](docs/configure-gvisor-containerd-shim.md)
+- [Configure gvisor-containerd-shim (shim v1) (containerd <= 1.2)](v1/configure-gvisor-containerd-shim.md)
+- [Runtime Handler/RuntimeClass Quick Start (containerd >= 1.2)](v2/runtime-handler-quickstart.md)
+- [Runtime Handler/RuntimeClass Quick Start (shim v2) (containerd >= 1.2)](v2/runtime-handler-shim-v2-quickstart.md)
+- [Configure containerd-shim-runsc-v1 (shim v2) (containerd >= 1.3)](v2/configure-containerd-shim-runsc-v1.md)

-[containerd-shim]: https://github.com/containerd/containerd/tree/master/cmd/containerd-shim
+[containerd]: https://github.com/containerd/containerd
diff --git a/shim/configure-containerd-shim-runsc-v1.md b/shim/configure-containerd-shim-runsc-v1.md
deleted file mode 100644
index 977ceacbd..000000000
--- a/shim/configure-containerd-shim-runsc-v1.md
+++ /dev/null
@@ -1,72 +0,0 @@
-# Configure containerd-shim-runsc-v1 (Shim V2)
-
-This document describes how to configure runtime options for
-`containerd-shim-runsc-v1`. This is follows on to the instructions of
-[Runtime Handler Quick Start (shim v2) (containerd >=1.2)](runtime-handler-shim-v2-quickstart.md)
-and requires containerd 1.3 or later.
-
-### Update `/etc/containerd/config.toml` to point to a configuration file for `containerd-shim-runsc-v1`.
-
-`containerd-shim-runsc-v1` supports a few different configuration options based
-on the version of containerd that is used. For versions >= 1.3, it supports a
-configurable config path in the containerd runtime configuration.
-
-```shell
-{ # Step 1: Update runtime options for runsc in containerd config.toml
-cat <
-[embedmd]:# (../test/e2e/shim-install.sh shell /{ # Step 1\(dev\)/ /^}/)
-```shell
-{ # Step 1(dev): Build and install gvisor-containerd-shim and containerd-shim-runsc-v1
-  make
-  sudo make install
-}
-```
-
-### Configure containerd
-
-1. Update `/etc/containerd/config.toml`. Make sure `containerd-shim-runsc-v1` is
-   in `${PATH}`.
-
-[embedmd]:# (../test/e2e/runtime-handler-shim-v2/install.sh shell /{ # Step 1/ /^}/)
-```shell
-{ # Step 1: Create containerd config.toml
-cat <
+> Note: This shim version is supported only for containerd versions less than
+> 1.2. If you are using a containerd version greater than or equal to 1.2, then
+> please use `containerd-shim-runsc-v1` (Shim API v2).
+>
+> This containerd shim is supported only in a best-effort capacity.
+
+This document describes how to configure and use `gvisor-containerd-shim`.
+
+## Containerd Configuration
+
+To use this shim, you must configure `/etc/containerd/config.toml` as follows:
+
+```
+[plugins.linux]
+  shim = "/usr/bin/gvisor-containerd-shim"
+[plugins.cri.containerd.runtimes.gvisor]
+  runtime_type = "io.containerd.runtime.v1.linux"
+  runtime_engine = "/usr/bin/runsc"
+  runtime_root = "/run/containerd/runsc"
+```
+
+In order to pick up the new configuration, you may need to restart containerd:
+
+```shell
+sudo systemctl restart containerd
+```
+
+## Shim Configuration
+
+The shim configuration is stored in `/etc/containerd/runsc.toml`. The
+configuration file supports two values.
+
+* `runc_shim`: The path to the runc shim. This is used by
+  `gvisor-containerd-shim` to run standard containers.
+
+* `runsc_config`: This is a set of key/value pairs that are converted into
+  `runsc` command line flags. You can learn more about which flags are
+  available by running `runsc flags`.
+
+For example, a configuration might look as follows:
+
+```
+runc_shim = "/usr/local/bin/containerd-shim"
+[runsc_config]
+platform = "kvm"
+debug = "true"
+debug-log = "/var/log/%ID%/gvisor.log"
+```
diff --git a/shim/v1/config.go b/shim/v1/config.go
new file mode 100644
index 000000000..a72cc7754
--- /dev/null
+++ b/shim/v1/config.go
@@ -0,0 +1,40 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import "github.com/BurntSushi/toml"
+
+// config is the configuration for the gvisor containerd shim.
+type config struct {
+	// RuncShim is the shim binary path for the standard containerd-shim
+	// for runc. When the runtime is `runc`, the gvisor containerd shim
+	// will exec the current process to the standard containerd-shim. This
+	// is a workaround for containerd 1.1. In containerd 1.2, containerd
+	// will choose different containerd-shims based on the runtime.
+	RuncShim string `toml:"runc_shim"`
+	// RunscConfig is the configuration for runsc. Each key/value pair is
+	// converted directly into the runsc flag --key=value.
+	RunscConfig map[string]string `toml:"runsc_config"`
+}
+
+// loadConfig loads the gvisor containerd shim config from the given path.
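+// A missing configuration file is not an error: callers check the returned
+// error with os.IsNotExist and fall back to the defaults.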
+func loadConfig(path string) (*config, error) {
+    var c config
+    _, err := toml.DecodeFile(path, &c)
+    if err != nil {
+        return &c, err
+    }
+    return &c, nil
+}
diff --git a/shim/v1/main.go b/shim/v1/main.go
index 41c77394a..43deee858 100644
--- a/shim/v1/main.go
+++ b/shim/v1/main.go
@@ -16,11 +16,252 @@
 package main

 import (
-    "github.com/containerd/containerd/runtime/v2/shim"
+    "bytes"
+    "context"
+    "flag"
+    "fmt"
+    "log"
+    "net"
+    "os"
+    "os/exec"
+    "os/signal"
+    "path/filepath"
+    "strings"
+    "sync"
+    "syscall"

-    runsc "gvisor.dev/gvisor/pkg/shim/v2"
+    "github.com/containerd/containerd/events"
+    "github.com/containerd/containerd/namespaces"
+    "github.com/containerd/containerd/runtime/v1/linux/proc"
+    containerdshim "github.com/containerd/containerd/runtime/v1/shim"
+    shimapi "github.com/containerd/containerd/runtime/v1/shim/v1"
+    "github.com/containerd/ttrpc"
+    "github.com/containerd/typeurl"
+    ptypes "github.com/gogo/protobuf/types"
+    "github.com/opencontainers/runc/libcontainer/system"
+    "golang.org/x/sys/unix"
+
+    "gvisor.dev/gvisor/pkg/shim/runsc"
+    "gvisor.dev/gvisor/pkg/shim/v1/shim"
+)
+
+var (
+    debugFlag            bool
+    namespaceFlag        string
+    socketFlag           string
+    addressFlag          string
+    workdirFlag          string
+    runtimeRootFlag      string
+    containerdBinaryFlag string
+    shimConfigFlag       string
 )

+func init() {
+    flag.BoolVar(&debugFlag, "debug", false, "enable debug output in logs")
+    flag.StringVar(&namespaceFlag, "namespace", "", "namespace that owns the shim")
+    flag.StringVar(&socketFlag, "socket", "", "abstract socket path to serve")
+    flag.StringVar(&addressFlag, "address", "", "grpc address back to main containerd")
+    flag.StringVar(&workdirFlag, "workdir", "", "path used to store large temporary data")
+
+    // Containerd defaults to runc, unless another runtime is explicitly
+    // specified. We keep the same default to make the default behavior
+    // consistent.
+    flag.StringVar(&runtimeRootFlag, "runtime-root", proc.RuncRoot, "root directory for the runtime")
+
+    // Currently, the `containerd publish` utility is embedded in the
+    // daemon binary. The daemon invokes `containerd-shim
+    // -containerd-binary ...` with its own os.Executable() path.
+    flag.StringVar(&containerdBinaryFlag, "containerd-binary", "containerd", "path to containerd binary (used for `containerd publish`)")
+    flag.StringVar(&shimConfigFlag, "config", "/etc/containerd/runsc.toml", "path to the shim configuration file")
+}
+
 func main() {
-    shim.Run("io.containerd.runsc.v1", runsc.New)
+    flag.Parse()
+
+    // This is a hack. Exec the current process to run the standard
+    // containerd-shim if the runtime root is not `runsc`. We don't need
+    // this for the shim v2 API.
+    if filepath.Base(runtimeRootFlag) != "runsc" {
+        if err := executeRuncShim(); err != nil {
+            fmt.Fprintf(os.Stderr, "gvisor-containerd-shim: %s\n", err)
+            os.Exit(1)
+        }
+    }
+
+    // Run regular shim if needed.
+    if err := executeShim(); err != nil {
+        fmt.Fprintf(os.Stderr, "gvisor-containerd-shim: %s\n", err)
+        os.Exit(1)
+    }
+}
+
+// executeRuncShim execs the current process to a containerd-shim process,
+// retaining all flags and the environment.
+func executeRuncShim() error {
+    c, err := loadConfig(shimConfigFlag)
+    if err != nil && !os.IsNotExist(err) {
+        return fmt.Errorf("failed to load shim config: %w", err)
+    }
+    shimPath := c.RuncShim
+    if shimPath == "" {
+        shimPath, err = exec.LookPath("containerd-shim")
+        if err != nil {
+            return fmt.Errorf("lookup containerd-shim failed: %w", err)
+        }
+    }
+
+    args := append([]string{shimPath}, os.Args[1:]...)
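+    // syscall.Exec replaces the current process image; on success it never
+    // returns, and the standard containerd-shim takes over with the same
+    // arguments and environment.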
+    if err := syscall.Exec(shimPath, args, os.Environ()); err != nil {
+        return fmt.Errorf("exec containerd-shim @ %q failed: %w", shimPath, err)
+    }
+    return nil
+}
+
+func executeShim() error {
+    // Start handling signals as soon as possible so that things are
+    // properly reaped, and so that a runtime exit before the handler is
+    // installed is not missed.
+    signals, err := setupSignals()
+    if err != nil {
+        return err
+    }
+    path, err := os.Getwd()
+    if err != nil {
+        return err
+    }
+    server, err := ttrpc.NewServer(ttrpc.WithServerHandshaker(ttrpc.UnixSocketRequireSameUser()))
+    if err != nil {
+        return fmt.Errorf("failed creating server: %w", err)
+    }
+    c, err := loadConfig(shimConfigFlag)
+    if err != nil && !os.IsNotExist(err) {
+        return fmt.Errorf("failed to load shim config: %w", err)
+    }
+    sv, err := shim.NewService(
+        shim.Config{
+            Path:        path,
+            Namespace:   namespaceFlag,
+            WorkDir:     workdirFlag,
+            RuntimeRoot: runtimeRootFlag,
+            RunscConfig: c.RunscConfig,
+        },
+        &remoteEventsPublisher{address: addressFlag},
+    )
+    if err != nil {
+        return err
+    }
+    shimapi.RegisterShimService(server, sv)
+    if err := serve(server, socketFlag); err != nil {
+        return err
+    }
+    return handleSignals(signals, server, sv)
+}
+
+// serve serves the ttrpc API over a unix socket at the provided path; this
+// function does not block.
+func serve(server *ttrpc.Server, path string) error {
+    var (
+        l   net.Listener
+        err error
+    )
+    if path == "" {
+        l, err = net.FileListener(os.NewFile(3, "socket"))
+        path = "[inherited from parent]"
+    } else {
+        if len(path) > 106 {
+            return fmt.Errorf("%q: unix socket path too long (> 106)", path)
+        }
+        l, err = net.Listen("unix", "\x00"+path)
+    }
+    if err != nil {
+        return err
+    }
+    go func() {
+        defer l.Close()
+        err := server.Serve(context.Background(), l)
+        if err != nil && !strings.Contains(err.Error(), "use of closed network connection") {
+            log.Fatalf("ttrpc server failure: %v", err)
+        }
+    }()
+    return nil
+}
+
+// setupSignals creates a new signal handler for all signals and sets the shim
+// as a sub-reaper so that the container processes are reparented.
+func setupSignals() (chan os.Signal, error) {
+    signals := make(chan os.Signal, 32)
+    signal.Notify(signals, unix.SIGTERM, unix.SIGINT, unix.SIGCHLD, unix.SIGPIPE)
+    // Make sure runsc is set up to use the monitor for waiting on
+    // processes.
+    // TODO(random-liu): Move shim/reaper.go to a separate package.
+    runsc.Monitor = containerdshim.Default
+    // Set the shim as the subreaper for all orphaned processes created by
+    // the container.
+    if err := system.SetSubreaper(1); err != nil {
+        return nil, err
+    }
+    return signals, nil
+}
+
+func handleSignals(signals chan os.Signal, server *ttrpc.Server, sv *shim.Service) error {
+    var (
+        termOnce sync.Once
+        done     = make(chan struct{})
+    )
+
+    for {
+        select {
+        case <-done:
+            return nil
+        case s := <-signals:
+            switch s {
+            case unix.SIGCHLD:
+                if err := containerdshim.Reap(); err != nil {
+                    log.Printf("reap exit status: %v", err)
+                }
+            case unix.SIGTERM, unix.SIGINT:
+                go termOnce.Do(func() {
+                    ctx := context.TODO()
+                    if err := server.Shutdown(ctx); err != nil {
+                        log.Printf("failed to shutdown server: %v", err)
+                    }
+                    // Ensure our child is dead if any.
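+                    // Errors from Kill and Delete are deliberately
+                    // ignored here: the child may have already exited.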
+                    sv.Kill(ctx, &shimapi.KillRequest{
+                        Signal: uint32(syscall.SIGKILL),
+                        All:    true,
+                    })
+                    sv.Delete(context.Background(), &ptypes.Empty{})
+                    close(done)
+                })
+            case unix.SIGPIPE:
+            }
+        }
+    }
+}
+
+type remoteEventsPublisher struct {
+    address string
+}
+
+func (l *remoteEventsPublisher) Publish(ctx context.Context, topic string, event events.Event) error {
+    ns, _ := namespaces.Namespace(ctx)
+    encoded, err := typeurl.MarshalAny(event)
+    if err != nil {
+        return err
+    }
+    data, err := encoded.Marshal()
+    if err != nil {
+        return err
+    }
+    cmd := exec.CommandContext(ctx, containerdBinaryFlag, "--address", l.address, "publish", "--topic", topic, "--namespace", ns)
+    cmd.Stdin = bytes.NewReader(data)
+    c, err := containerdshim.Default.Start(cmd)
+    if err != nil {
+        return err
+    }
+    status, err := containerdshim.Default.Wait(cmd, c)
+    if err != nil {
+        return fmt.Errorf("failed to publish event: %w", err)
+    }
+    if status != 0 {
+        return fmt.Errorf("failed to publish event: status %d", status)
+    }
+    return nil
+}
diff --git a/shim/v2/BUILD b/shim/v2/BUILD
new file mode 100644
index 000000000..1e1947dab
--- /dev/null
+++ b/shim/v2/BUILD
@@ -0,0 +1,32 @@
+load("//tools:defs.bzl", "go_binary", "pkg_tar")
+load("//website:defs.bzl", "doc")
+
+package(licenses = ["notice"])
+
+go_binary(
+    name = "containerd-shim-runsc-v1",
+    srcs = [
+        "main.go",
+    ],
+    pure = True,
+    visibility = [
+        "//visibility:public",
+    ],
+    deps = [
+        "//pkg/shim/runsc",
+        "//pkg/shim/v1/shim",
+        "//pkg/shim/v2",
+        "@com_github_burntsushi_toml//:go_default_library",
+        "@com_github_containerd_containerd//runtime/v2/shim:go_default_library",
+    ],
+)
+
+doc(
+    name = "doc",
+    src = "README.md",
+    category = "User Guide",
+    permalink = "/docs/user_guide/containerd-shim-runsc-v1/",
+    subcategory = "Advanced",
+    visibility = ["//website:__pkg__"],
+    weight = "92",
+)
diff --git a/shim/v2/README.md b/shim/v2/README.md
new file mode 100644
index 000000000..2fd625415
--- /dev/null
+++ b/shim/v2/README.md
@@ -0,0 +1,90 @@
+# containerd-shim-runsc-v1
+
+> Note: This shim version is the recommended shim for containerd versions
+> greater than or equal to 1.2. For older versions of containerd, use
+> `gvisor-containerd-shim`.
+
+This document describes how to configure and use `containerd-shim-runsc-v1`.
+
+## Configuring Containerd 1.2
+
+To configure containerd 1.2 to use this shim, add the runtime to
+`/etc/containerd/config.toml` as follows:
+
+```
+[plugins.cri.containerd.runtimes.runsc]
+  runtime_type = "io.containerd.runsc.v1"
+  runtime_root = "/run/containerd/runsc"
+[plugins.cri.containerd.runtimes.runsc.options]
+  TypeUrl = "io.containerd.runsc.v1.options"
+```
+
+The configuration will optionally be loaded from a file named `config.toml` in
+the `runtime_root` configured above.
+
+In order to pick up the new configuration, you may need to restart containerd:
+
+```shell
+sudo systemctl restart containerd
+```
+
+## Configuring Containerd 1.3 and above
+
+To configure containerd 1.3 to use this shim, add the runtime to
+`/etc/containerd/config.toml` as follows:
+
+```
+[plugins.cri.containerd.runtimes.runsc]
+  runtime_type = "io.containerd.runsc.v1"
+[plugins.cri.containerd.runtimes.runsc.options]
+  TypeUrl = "io.containerd.runsc.v1.options"
+  ConfigPath = "/etc/containerd/runsc.toml"
+```
+
+The `ConfigPath` above will be used to provide a pointer to the configuration
+file to be loaded.
+
+> Note that no configuration file will be loaded if `ConfigPath` is not set.
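+
+For illustration only, a minimal file at that path might look like the
+following sketch (the supported options are listed in the next section; the
+`debug` and `debug-log` entries are simply examples of `runsc` flags):
+
+```
+binary_name = "runsc"
+[runsc_config]
+  debug = "true"
+  debug-log = "/tmp/runsc-logs/"
+```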
+
+In order to pick up the new configuration, you may need to restart containerd:
+
+```shell
+sudo systemctl restart containerd
+```
+
+## Shim Configuration
+
+The shim configuration may carry the following options:
+
+* `shim_cgroup`: The cgroup to use for the shim itself.
+* `io_uid`: The UID to use for pipes.
+* `io_gid`: The GID to use for pipes.
+* `binary_name`: The runtime binary name (defaults to `runsc`).
+* `root`: The root directory for the runtime.
+* `runsc_config`: A dictionary of key-value pairs that will be passed to the
+  runtime as arguments.
+
+### Example: Enable the KVM platform
+
+gVisor enables the use of a number of platforms. This example shows how to
+configure `containerd-shim-runsc-v1` to use gVisor with the KVM platform:
+
+```shell
+cat <
-        if len(path) > 106 {
-            return errors.Errorf("%q: unix socket path too long (> 106)", path)
-        }
-        l, err = net.Listen("unix", "\x00"+path)
-    }
-    if err != nil {
-        return err
-    }
-    logrus.WithField("socket", path).Debug("serving api on unix socket")
-    go func() {
-        defer l.Close()
-        if err := server.Serve(context.Background(), l); err != nil &&
-            !strings.Contains(err.Error(), "use of closed network connection") {
-            logrus.WithError(err).Fatal("gvisor-containerd-shim: ttrpc server failure")
-        }
-    }()
-    return nil
-}
-
-// setupSignals creates a new signal handler for all signals and sets the shim as a
-// sub-reaper so that the container processes are reparented
-func setupSignals() (chan os.Signal, error) {
-    signals := make(chan os.Signal, 32)
-    signal.Notify(signals, unix.SIGTERM, unix.SIGINT, unix.SIGCHLD, unix.SIGPIPE)
-    // make sure runc is setup to use the monitor
-    // for waiting on processes
-    // TODO(random-liu): Move shim/reaper.go to a separate package.
-    runsc.Monitor = containerdshim.Default
-    // set the shim as the subreaper for all orphaned processes created by the container
-    if err := system.SetSubreaper(1); err != nil {
-        return nil, err
-    }
-    return signals, nil
-}
-
-func handleSignals(logger *logrus.Entry, signals chan os.Signal, server *ttrpc.Server, sv *shim.Service) error {
-    var (
-        termOnce sync.Once
-        done     = make(chan struct{})
-    )
-
-    for {
-        select {
-        case <-done:
-            return nil
-        case s := <-signals:
-            switch s {
-            case unix.SIGCHLD:
-                if err := containerdshim.Reap(); err != nil {
-                    logger.WithError(err).Error("reap exit status")
-                }
-            case unix.SIGTERM, unix.SIGINT:
-                go termOnce.Do(func() {
-                    ctx := context.TODO()
-                    if err := server.Shutdown(ctx); err != nil {
-                        logger.WithError(err).Error("failed to shutdown server")
-                    }
-                    // Ensure our child is dead if any
-                    sv.Kill(ctx, &shimapi.KillRequest{
-                        Signal: uint32(syscall.SIGKILL),
-                        All:    true,
-                    })
-                    sv.Delete(context.Background(), &ptypes.Empty{})
-                    close(done)
-                })
-            case unix.SIGPIPE:
-            }
-        }
-    }
-}
-
-func dumpStacks(logger *logrus.Entry) {
-    var (
-        buf       []byte
-        stackSize int
-    )
-    bufferLen := 16384
-    for stackSize == len(buf) {
-        buf = make([]byte, bufferLen)
-        stackSize = runtime.Stack(buf, true)
-        bufferLen *= 2
-    }
-    buf = buf[:stackSize]
-    logger.Infof("=== BEGIN goroutine stack dump ===\n%s\n=== END goroutine stack dump ===", buf)
-}
-
-type remoteEventsPublisher struct {
-    address string
-}
-
-func (l *remoteEventsPublisher) Publish(ctx context.Context, topic string, event events.Event) error {
-    ns, _ := namespaces.Namespace(ctx)
-    encoded, err := typeurl.MarshalAny(event)
-    if err != nil {
-        return err
-    }
-    data, err := encoded.Marshal()
-    if err != nil {
-        return err
-    }
-    cmd := exec.CommandContext(ctx, containerdBinaryFlag, "--address",
l.address, "publish", "--topic", topic, "--namespace", ns) - cmd.Stdin = bytes.NewReader(data) - c, err := containerdshim.Default.Start(cmd) - if err != nil { - return err - } - status, err := containerdshim.Default.Wait(cmd, c) - if err != nil { - return err - } - if status != 0 { - return errors.New("failed to publish event") - } - return nil + shim.Run("io.containerd.runsc.v1", runsc.New) } diff --git a/shim/v2/runtime-handler-shim-v2-quickstart.md b/shim/v2/runtime-handler-shim-v2-quickstart.md new file mode 100644 index 000000000..ca8336089 --- /dev/null +++ b/shim/v2/runtime-handler-shim-v2-quickstart.md @@ -0,0 +1,232 @@ +# Runtime Handler Quickstart (Shim V2) + +This document describes how to install and run `containerd-shim-runsc-v1` using +the containerd runtime handler support. This requires containerd 1.2 or later. + +## Requirements + +- **runsc**: See the [gVisor documentation](https://github.com/google/gvisor) for information on how to install runsc. +- **containerd**: See the [containerd website](https://containerd.io/) for information on how to install containerd. + +## Install + +### Install containerd-shim-runsc-v1 + +1. Build and install `containerd-shim-runsc-v1`. + + +[embedmd]:# (../test/e2e/shim-install.sh shell /{ # Step 1\(dev\)/ /^}/) +```shell +{ # Step 1(dev): Build and install gvisor-containerd-shim and containerd-shim-runsc-v1 + make + sudo make install +} +``` + +### Configure containerd + +1. Update `/etc/containerd/config.toml`. Make sure `containerd-shim-runsc-v1` is + in `${PATH}`. + +[embedmd]:# (../test/e2e/runtime-handler-shim-v2/install.sh shell /{ # Step 1/ /^}/) +```shell +{ # Step 1: Create containerd config.toml +cat < 0 { // Omit if empty. s["command"] = cmd @@ -95,25 +98,29 @@ var Httpd = SimpleSpec("httpd", "basic/httpd", nil, nil) // TestCrictlSanity refers to b/112433158. func TestCrictlSanity(t *testing.T) { - // Setup containerd and crictl. - crictl, cleanup, err := setup(t) - if err != nil { - t.Fatalf("failed to setup crictl: %v", err) - } - defer cleanup() - podID, contID, err := crictl.StartPodAndContainer("basic/httpd", Sandbox("default"), Httpd) - if err != nil { - t.Fatalf("start failed: %v", err) - } - - // Look for the httpd page. - if err = httpGet(crictl, podID, "index.html"); err != nil { - t.Fatalf("failed to get page: %v", err) - } - - // Stop everything. - if err := crictl.StopPodAndContainer(podID, contID); err != nil { - t.Fatalf("stop failed: %v", err) + for _, version := range allVersions { + t.Run(version, func(t *testing.T) { + // Setup containerd and crictl. + crictl, cleanup, err := setup(t, version) + if err != nil { + t.Fatalf("failed to setup crictl: %v", err) + } + defer cleanup() + podID, contID, err := crictl.StartPodAndContainer(containerdRuntime, "basic/httpd", Sandbox("default"), Httpd) + if err != nil { + t.Fatalf("start failed: %v", err) + } + + // Look for the httpd page. + if err = httpGet(crictl, podID, "index.html"); err != nil { + t.Fatalf("failed to get page: %v", err) + } + + // Stop everything. + if err := crictl.StopPodAndContainer(podID, contID); err != nil { + t.Fatalf("stop failed: %v", err) + } + }) } } @@ -147,146 +154,179 @@ var HttpdMountPaths = SimpleSpec("httpd", "basic/httpd", nil, map[string]interfa // TestMountPaths refers to b/117635704. func TestMountPaths(t *testing.T) { - // Setup containerd and crictl. 
- crictl, cleanup, err := setup(t) - if err != nil { - t.Fatalf("failed to setup crictl: %v", err) - } - defer cleanup() - podID, contID, err := crictl.StartPodAndContainer("basic/httpd", Sandbox("default"), HttpdMountPaths) - if err != nil { - t.Fatalf("start failed: %v", err) - } - - // Look for the directory available at /test. - if err = httpGet(crictl, podID, "test"); err != nil { - t.Fatalf("failed to get page: %v", err) - } - - // Stop everything. - if err := crictl.StopPodAndContainer(podID, contID); err != nil { - t.Fatalf("stop failed: %v", err) + for _, version := range allVersions { + t.Run(version, func(t *testing.T) { + // Setup containerd and crictl. + crictl, cleanup, err := setup(t, version) + if err != nil { + t.Fatalf("failed to setup crictl: %v", err) + } + defer cleanup() + podID, contID, err := crictl.StartPodAndContainer(containerdRuntime, "basic/httpd", Sandbox("default"), HttpdMountPaths) + if err != nil { + t.Fatalf("start failed: %v", err) + } + + // Look for the directory available at /test. + if err = httpGet(crictl, podID, "test"); err != nil { + t.Fatalf("failed to get page: %v", err) + } + + // Stop everything. + if err := crictl.StopPodAndContainer(podID, contID); err != nil { + t.Fatalf("stop failed: %v", err) + } + }) } } // TestMountPaths refers to b/118728671. func TestMountOverSymlinks(t *testing.T) { - // Setup containerd and crictl. - crictl, cleanup, err := setup(t) - if err != nil { - t.Fatalf("failed to setup crictl: %v", err) - } - defer cleanup() - - spec := SimpleSpec("busybox", "basic/resolv", []string{"sleep", "1000"}, nil) - podID, contID, err := crictl.StartPodAndContainer("basic/resolv", Sandbox("default"), spec) - if err != nil { - t.Fatalf("start failed: %v", err) - } - - out, err := crictl.Exec(contID, "readlink", "/etc/resolv.conf") - if err != nil { - t.Fatalf("readlink failed: %v, out: %s", err, out) - } - if want := "/tmp/resolv.conf"; !strings.Contains(string(out), want) { - t.Fatalf("/etc/resolv.conf is not pointing to %q: %q", want, string(out)) - } - - etc, err := crictl.Exec(contID, "cat", "/etc/resolv.conf") - if err != nil { - t.Fatalf("cat failed: %v, out: %s", err, etc) - } - tmp, err := crictl.Exec(contID, "cat", "/tmp/resolv.conf") - if err != nil { - t.Fatalf("cat failed: %v, out: %s", err, out) - } - if tmp != etc { - t.Fatalf("file content doesn't match:\n\t/etc/resolv.conf: %s\n\t/tmp/resolv.conf: %s", string(etc), string(tmp)) - } - - // Stop everything. - if err := crictl.StopPodAndContainer(podID, contID); err != nil { - t.Fatalf("stop failed: %v", err) + for _, version := range allVersions { + t.Run(version, func(t *testing.T) { + // Setup containerd and crictl. 
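+            // Each containerd API version under test gets its own
+            // freshly configured containerd instance.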
+ crictl, cleanup, err := setup(t, version) + if err != nil { + t.Fatalf("failed to setup crictl: %v", err) + } + defer cleanup() + + spec := SimpleSpec("busybox", "basic/resolv", []string{"sleep", "1000"}, nil) + podID, contID, err := crictl.StartPodAndContainer(containerdRuntime, "basic/resolv", Sandbox("default"), spec) + if err != nil { + t.Fatalf("start failed: %v", err) + } + + out, err := crictl.Exec(contID, "readlink", "/etc/resolv.conf") + if err != nil { + t.Fatalf("readlink failed: %v, out: %s", err, out) + } + if want := "/tmp/resolv.conf"; !strings.Contains(string(out), want) { + t.Fatalf("/etc/resolv.conf is not pointing to %q: %q", want, string(out)) + } + + etc, err := crictl.Exec(contID, "cat", "/etc/resolv.conf") + if err != nil { + t.Fatalf("cat failed: %v, out: %s", err, etc) + } + tmp, err := crictl.Exec(contID, "cat", "/tmp/resolv.conf") + if err != nil { + t.Fatalf("cat failed: %v, out: %s", err, out) + } + if tmp != etc { + t.Fatalf("file content doesn't match:\n\t/etc/resolv.conf: %s\n\t/tmp/resolv.conf: %s", string(etc), string(tmp)) + } + + // Stop everything. + if err := crictl.StopPodAndContainer(podID, contID); err != nil { + t.Fatalf("stop failed: %v", err) + } + }) } } // TestHomeDir tests that the HOME environment variable is set for // Pod containers. func TestHomeDir(t *testing.T) { - // Setup containerd and crictl. - crictl, cleanup, err := setup(t) - if err != nil { - t.Fatalf("failed to setup crictl: %v", err) + for _, version := range allVersions { + t.Run(version, func(t *testing.T) { + // Setup containerd and crictl. + crictl, cleanup, err := setup(t, version) + if err != nil { + t.Fatalf("failed to setup crictl: %v", err) + } + defer cleanup() + + // Note that container ID returned here is a sub-container. All Pod + // containers are sub-containers. The root container of the sandbox is the + // pause container. + t.Run("sub-container", func(t *testing.T) { + contSpec := SimpleSpec("subcontainer", "basic/busybox", []string{"sh", "-c", "echo $HOME"}, nil) + podID, contID, err := crictl.StartPodAndContainer(containerdRuntime, "basic/busybox", Sandbox("subcont-sandbox"), contSpec) + if err != nil { + t.Fatalf("start failed: %v", err) + } + + out, err := crictl.Logs(contID) + if err != nil { + t.Fatalf("failed retrieving container logs: %v, out: %s", err, out) + } + if got, want := strings.TrimSpace(string(out)), "/root"; got != want { + t.Fatalf("Home directory invalid. Got %q, Want : %q", got, want) + } + + // Stop everything; note that the pod may have already stopped. + crictl.StopPodAndContainer(podID, contID) + }) + + // Tests that HOME is set for the exec process. + t.Run("exec", func(t *testing.T) { + contSpec := SimpleSpec("exec", "basic/busybox", []string{"sleep", "1000"}, nil) + podID, contID, err := crictl.StartPodAndContainer(containerdRuntime, "basic/busybox", Sandbox("exec-sandbox"), contSpec) + if err != nil { + t.Fatalf("start failed: %v", err) + } + + out, err := crictl.Exec(contID, "sh", "-c", "echo $HOME") + if err != nil { + t.Fatalf("failed retrieving container logs: %v, out: %s", err, out) + } + if got, want := strings.TrimSpace(string(out)), "/root"; got != want { + t.Fatalf("Home directory invalid. Got %q, Want : %q", got, want) + } + + // Stop everything. + if err := crictl.StopPodAndContainer(podID, contID); err != nil { + t.Fatalf("stop failed: %v", err) + } + }) + }) } - defer cleanup() - - // Note that container ID returned here is a sub-container. All Pod - // containers are sub-containers. 
The root container of the sandbox is the
-    // pause container.
-    t.Run("sub-container", func(t *testing.T) {
-        contSpec := SimpleSpec("subcontainer", "basic/busybox", []string{"sh", "-c", "echo $HOME"}, nil)
-        podID, contID, err := crictl.StartPodAndContainer("basic/busybox", Sandbox("subcont-sandbox"), contSpec)
-        if err != nil {
-            t.Fatalf("start failed: %v", err)
-        }
-
-        out, err := crictl.Logs(contID)
-        if err != nil {
-            t.Fatalf("failed retrieving container logs: %v, out: %s", err, out)
-        }
-        if got, want := strings.TrimSpace(string(out)), "/root"; got != want {
-            t.Fatalf("Home directory invalid. Got %q, Want : %q", got, want)
-        }
-
-        // Stop everything.
-        if err := crictl.StopPodAndContainer(podID, contID); err != nil {
-            t.Fatalf("stop failed: %v", err)
-        }
-    })
-
-    // Tests that HOME is set for the exec process.
-    t.Run("exec", func(t *testing.T) {
-        contSpec := SimpleSpec("exec", "basic/busybox", []string{"sleep", "1000"}, nil)
-        podID, contID, err := crictl.StartPodAndContainer("basic/busybox", Sandbox("exec-sandbox"), contSpec)
-        if err != nil {
-            t.Fatalf("start failed: %v", err)
-        }
-
-        out, err := crictl.Exec(contID, "sh", "-c", "echo $HOME")
-        if err != nil {
-            t.Fatalf("failed retrieving container logs: %v, out: %s", err, out)
-        }
-        if got, want := strings.TrimSpace(string(out)), "/root"; got != want {
-            t.Fatalf("Home directory invalid. Got %q, Want : %q", got, want)
-        }
-
-        // Stop everything.
-        if err := crictl.StopPodAndContainer(podID, contID); err != nil {
-            t.Fatalf("stop failed: %v", err)
-        }
-    })
 }

-// containerdConfigTemplate is a .toml config for containerd. It contains a
-// formatting verb so the runtime field can be set via fmt.Sprintf.
-const containerdConfigTemplate = `
+const containerdRuntime = "runsc"
+
+const v1Template = `
 disabled_plugins = ["restart"]
+[plugins.cri]
+  disable_tcp_service = true
 [plugins.linux]
-  runtime = "%s"
-  runtime_root = "/tmp/test-containerd/runsc"
-  shim = "/usr/local/bin/gvisor-containerd-shim"
+  shim = "%s"
   shim_debug = true
-
-[plugins.cri.containerd.runtimes.runsc]
+[plugins.cri.containerd.runtimes.` + containerdRuntime + `]
   runtime_type = "io.containerd.runtime.v1.linux"
   runtime_engine = "%s"
+  runtime_root = "%s/root/runsc"
 `
+
+const v2Template = `
+disabled_plugins = ["restart"]
+[plugins.cri]
+  disable_tcp_service = true
+[plugins.linux]
+  shim_debug = true
+[plugins.cri.containerd.runtimes.` + containerdRuntime + `]
+  runtime_type = "io.containerd.` + containerdRuntime + `.v1"
+[plugins.cri.containerd.runtimes.` + containerdRuntime + `.options]
+  TypeUrl = "io.containerd.` + containerdRuntime + `.v1.options"
+`
+
+const (
+    // v1 is the containerd API v1.
+    v1 string = "v1"
+
+    // v2 is the containerd API v2.
+    v2 string = "v2"
+)
+
+// allVersions is the set of known versions.
+var allVersions = []string{v1, v2}
+
 // setup sets up before a test. Specifically it:
 // * Creates directories and a socket for containerd to utilize.
 // * Runs containerd and waits for it to reach a "ready" state for testing.
 // * Returns a cleanup function that should be called at the end of the test.
-func setup(t *testing.T) (*criutil.Crictl, func(), error) {
+func setup(t *testing.T, version string) (*criutil.Crictl, func(), error) {
 	// Create temporary containerd root and state directories, and a socket
 	// via which crictl and containerd communicate.
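+	// Every resource created below is registered with the cleanup object,
+	// so a failure part-way through setup releases everything already
+	// created.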
 	containerdRoot, err := ioutil.TempDir(testutil.TmpDir(), "containerd-root")
@@ -295,13 +335,43 @@ func setup(t *testing.T, version string) (*criutil.Crictl, func(), error) {
 	}
 	cu := cleanup.Make(func() { os.RemoveAll(containerdRoot) })
 	defer cu.Clean()
+	t.Logf("Using containerd root: %s", containerdRoot)

 	containerdState, err := ioutil.TempDir(testutil.TmpDir(), "containerd-state")
 	if err != nil {
 		t.Fatalf("failed to create containerd state: %v", err)
 	}
 	cu.Add(func() { os.RemoveAll(containerdState) })
-	sockAddr := filepath.Join(testutil.TmpDir(), "containerd-test.sock")
+	t.Logf("Using containerd state: %s", containerdState)
+
+	sockDir, err := ioutil.TempDir(testutil.TmpDir(), "containerd-sock")
+	if err != nil {
+		t.Fatalf("failed to create containerd socket directory: %v", err)
+	}
+	cu.Add(func() { os.RemoveAll(sockDir) })
+	sockAddr := path.Join(sockDir, "test.sock")
+	t.Logf("Using containerd socket: %s", sockAddr)
+
+	// Extract the containerd version.
+	versionCmd := exec.Command(getContainerd(), "-v")
+	out, err := versionCmd.CombinedOutput()
+	if err != nil {
+		t.Fatalf("error extracting containerd version: %v (%s)", err, string(out))
+	}
+	r := regexp.MustCompile(" v([0-9]+)\\.([0-9]+)\\.([0-9]+)")
+	vs := r.FindStringSubmatch(string(out))
+	if len(vs) != 4 {
+		t.Fatalf("unexpected version string: %s", string(out))
+	}
+	major, err := strconv.ParseUint(vs[1], 10, 64)
+	if err != nil {
+		t.Fatalf("error parsing containerd major version: %v (%s)", err, string(out))
+	}
+	minor, err := strconv.ParseUint(vs[2], 10, 64)
+	if err != nil {
+		t.Fatalf("error parsing containerd minor version: %v (%s)", err, string(out))
+	}
+	t.Logf("Using containerd version: %d.%d", major, minor)

 	// We rewrite a configuration. This is based on the current docker
 	// configuration for the runtime under test.
@@ -309,28 +379,100 @@
 	if err != nil {
 		t.Fatalf("error discovering runtime path: %v", err)
 	}
-	config, configCleanup, err := testutil.WriteTmpFile("containerd-config", fmt.Sprintf(containerdConfigTemplate, runtime, runtime))
+	t.Logf("Using runtime: %v", runtime)
+
+	// Construct a PATH that includes the runtime directory. This is
+	// because the shims will be installed there, and containerd may infer
+	// the binary name and search the PATH.
+	runtimeDir := path.Dir(runtime)
+	modifiedPath := os.Getenv("PATH")
+	if modifiedPath != "" {
+		modifiedPath = ":" + modifiedPath // We prepend below.
+	}
+	modifiedPath = path.Dir(getContainerd()) + modifiedPath
+	modifiedPath = runtimeDir + ":" + modifiedPath
+	t.Logf("Using PATH: %v", modifiedPath)
+
+	var (
+		config   string
+		runpArgs []string
+	)
+	switch version {
+	case v1:
+		// This is only supported for containerd versions before 1.3.
+		if major > 1 || (major == 1 && minor >= 3) {
+			t.Skipf("skipping unsupported containerd (want less than 1.3, got %d.%d)", major, minor)
+		}
+
+		// We provide the shim, followed by the runtime, and then a
+		// temporary root directory. Note that we can safely assume
+		// that the shim has been installed in the same directory as
+		// the runtime (for test installs and for normal installs).
+		// Since this is v1, the binary name will be fixed.
+		config = fmt.Sprintf(v1Template, path.Join(runtimeDir, "gvisor-containerd-shim"), runtime, runtimeDir)
+	case v2:
+		// This is only supported past 1.2.
+		if major < 1 || (major == 1 && minor <= 1) {
+			t.Skipf("skipping incompatible containerd (want at least 1.2, got %d.%d)", major, minor)
+		}
+
+		// The runtime is provided via parameter. Note that the v2 shim
+		// binary name is always containerd-shim-* so we don't actually
+		// care about the docker runtime name.
+		config = v2Template
+	default:
+		t.Fatalf("unknown version: %s", version)
+	}
+	t.Logf("Using config: %s", config)
+
+	// Generate the configuration for the test.
+	configFile, configCleanup, err := testutil.WriteTmpFile("containerd-config", config)
 	if err != nil {
 		t.Fatalf("failed to write containerd config")
 	}
 	cu.Add(configCleanup)

 	// Start containerd.
-	cmd := exec.Command(getContainerd(),
-		"--config", config,
+	args := []string{
+		getContainerd(),
+		"--config", configFile,
 		"--log-level", "debug",
 		"--root", containerdRoot,
 		"--state", containerdState,
-		"--address", sockAddr)
+		"--address", sockAddr,
+	}
+	t.Logf("Using args: %s", strings.Join(args, " "))
+	cmd := exec.Command(args[0], args[1:]...)
+	cmd.Env = append(os.Environ(), "PATH="+modifiedPath)
+
+	// Include output in logs.
+	stderrPipe, err := cmd.StderrPipe()
+	if err != nil {
+		t.Fatalf("failed to create stderr pipe: %v", err)
+	}
+	cu.Add(func() { stderrPipe.Close() })
+	stdoutPipe, err := cmd.StdoutPipe()
+	if err != nil {
+		t.Fatalf("failed to create stdout pipe: %v", err)
+	}
+	cu.Add(func() { stdoutPipe.Close() })
+	var (
+		wg     sync.WaitGroup
+		stderr bytes.Buffer
+		stdout bytes.Buffer
+	)
 	startupR, startupW := io.Pipe()
-	defer startupR.Close()
-	defer startupW.Close()
-	stderr := &bytes.Buffer{}
-	stdout := &bytes.Buffer{}
-	cmd.Stderr = io.MultiWriter(startupW, stderr)
-	cmd.Stdout = io.MultiWriter(startupW, stdout)
+	wg.Add(2)
+	go func() {
+		defer wg.Done()
+		io.Copy(io.MultiWriter(startupW, &stderr), stderrPipe)
+	}()
+	go func() {
+		defer wg.Done()
+		io.Copy(io.MultiWriter(startupW, &stdout), stdoutPipe)
+	}()
 	cu.Add(func() {
-		// Log output in case of failure.
+		wg.Wait()
 		t.Logf("containerd stdout: %s", stdout.String())
 		t.Logf("containerd stderr: %s", stderr.String())
 	})
@@ -345,13 +487,17 @@
 		t.Fatalf("failed to start containerd: %v", err)
 	}

+	// Discard all subsequent data.
+	go io.Copy(ioutil.Discard, startupR)
+
+	// Create the crictl interface.
+	cc := criutil.NewCrictl(t, sockAddr, runpArgs)
+	cu.Add(cc.CleanUp)
+
 	// Kill must be the last cleanup (as it will be executed first).
-	cc := criutil.NewCrictl(t, sockAddr)
 	cu.Add(func() {
-		cc.CleanUp() // Remove tmp files, etc.
-		if err := testutil.KillCommand(cmd); err != nil {
-			log.Printf("error killing containerd: %v", err)
-		}
+		// Best effort: ignore errors.
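+		// Cleanups run last-registered-first, so containerd is killed
+		// before the directories registered above are removed.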
+ testutil.KillCommand(cmd) }) return cc, cu.Release(), nil diff --git a/test/shim/containerd-install.sh b/test/shim/containerd-install.sh deleted file mode 100755 index 400819245..000000000 --- a/test/shim/containerd-install.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -# A script to install containerd and CNI plugins for e2e testing - -wget -q --https-only \ - https://github.com/containerd/containerd/releases/download/v${CONTAINERD_VERSION}/containerd-${CONTAINERD_VERSION}.linux-amd64.tar.gz \ - https://github.com/containernetworking/plugins/releases/download/v0.7.0/cni-plugins-amd64-v0.7.0.tgz - -sudo mkdir -p /etc/containerd /etc/cni/net.d /opt/cni/bin -sudo tar -xvf cni-plugins-amd64-v0.7.0.tgz -C /opt/cni/bin/ -sudo tar -xvf containerd-${CONTAINERD_VERSION}.linux-amd64.tar.gz -C / - -cat </tmp/containerd-cri.log & diff --git a/test/shim/crictl-install.sh b/test/shim/crictl-install.sh deleted file mode 100755 index 1d63c889b..000000000 --- a/test/shim/crictl-install.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -# A sample script for installing crictl. - -set -ex - -{ # Step 1: Download crictl -wget https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.13.0/crictl-v1.13.0-linux-amd64.tar.gz -tar xf crictl-v1.13.0-linux-amd64.tar.gz -sudo mv crictl /usr/local/bin -} - -{ # Step 2: Configure crictl -cat < /tmp/containerd-cri.log & -} diff --git a/test/shim/runtime-handler-shim-v2/test.sh b/test/shim/runtime-handler-shim-v2/test.sh deleted file mode 100755 index e33655ec1..000000000 --- a/test/shim/runtime-handler-shim-v2/test.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash - -# Runs end-to-end tests for gvisor-containerd-shim to test the use of runtime -# handler. This should work on containerd 1.2+ - -# This is meant to be run in a VM as it makes a fairly invasive install of -# containerd. - -set -ex - -# Install containerd -. ./test/e2e/containerd-install.sh - -# Install gVisor -. ./test/e2e/runsc-install.sh - -# Install gvisor-containerd-shim -. ./test/e2e/shim-install.sh - -# Test installation/configuration -. ./test/e2e/runtime-handler-shim-v2/install.sh - -# Install crictl -. ./test/e2e/crictl-install.sh - -# Test usage (the same with runtime-handler) -. ./test/e2e/runtime-handler/usage.sh - -# Run a container in the sandbox -. ./test/e2e/run-container.sh - -# Validate the pod and container -. ./test/e2e/validate.sh -. ./test/e2e/runtime-handler-shim-v2/validate.sh diff --git a/test/shim/runtime-handler-shim-v2/validate.sh b/test/shim/runtime-handler-shim-v2/validate.sh deleted file mode 100755 index b74a059ef..000000000 --- a/test/shim/runtime-handler-shim-v2/validate.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -# A sample script to validating the running containerd-shim-runsc-v1. - -set -ex - -ps aux | grep [c]ontainerd-shim-runsc-v1 diff --git a/test/shim/runtime-handler/install.sh b/test/shim/runtime-handler/install.sh deleted file mode 100755 index ebe9d3580..000000000 --- a/test/shim/runtime-handler/install.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -# A sample script for installing and configuring the gvisor-containerd-shim to -# use the containerd runtime handler. 
- -set -ex - -{ # Step 1: Create containerd config.toml -cat < /tmp/containerd-cri.log & -} diff --git a/test/shim/runtime-handler/test.sh b/test/shim/runtime-handler/test.sh deleted file mode 100755 index 99f3565b6..000000000 --- a/test/shim/runtime-handler/test.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -# Runs end-to-end tests for gvisor-containerd-shim to test the use of runtime -# handler. This should work on containerd 1.2+ - -# This is meant to be run in a VM as it makes a fairly invasive install of -# containerd. - -set -ex - -# Install containerd -. ./test/e2e/containerd-install.sh - -# Install gVisor -. ./test/e2e/runsc-install.sh - -# Install gvisor-containerd-shim -. ./test/e2e/shim-install.sh - -# Test installation/configuration -. ./test/e2e/runtime-handler/install.sh - -# Install crictl -. ./test/e2e/crictl-install.sh - -# Test usage -. ./test/e2e/runtime-handler/usage.sh - -# Run a container in the sandbox -. ./test/e2e/run-container.sh - -# Validate the pod and container -. ./test/e2e/validate.sh diff --git a/test/shim/runtime-handler/usage.sh b/test/shim/runtime-handler/usage.sh deleted file mode 100755 index 350c720c2..000000000 --- a/test/shim/runtime-handler/usage.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -# A sample script for testing the gvisor-containerd-shim -# using runtime handler. - -set -ex - -{ # Step 1: Pull the nginx image -sudo crictl pull nginx -} - -{ # Step 2: Create sandbox.json -cat </tmp/containerd-cri.log & -} diff --git a/test/shim/untrusted-workload/test.sh b/test/shim/untrusted-workload/test.sh deleted file mode 100755 index 6e312cf6d..000000000 --- a/test/shim/untrusted-workload/test.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -# Runs end-to-end tests for gvisor-containerd-shim to test using the -# untrusted workload extension. This should work on containerd 1.1+ - -# This is meant to be run in a VM as it makes a fairly invasive install of -# containerd. - -set -ex - -# Install containerd -. ./test/e2e/containerd-install.sh - -# Install gVisor -. ./test/e2e/runsc-install.sh - -# Install gvisor-containerd-shim -. ./test/e2e/shim-install.sh - -# Test installation/configuration -. ./test/e2e/untrusted-workload/install.sh - -# Install crictl -. ./test/e2e/crictl-install.sh - -# Test usage -. ./test/e2e/untrusted-workload/usage.sh - -# Run a container in the sandbox -. ./test/e2e/run-container.sh - -# Validate the pod and container -. ./test/e2e/validate.sh diff --git a/test/shim/untrusted-workload/usage.sh b/test/shim/untrusted-workload/usage.sh deleted file mode 100755 index db8206964..000000000 --- a/test/shim/untrusted-workload/usage.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -# A sample script for testing the gvisor-containerd-shim # using untrusted -# workload extension. - -set -ex - -{ # Step 1: Pull the nginx image -sudo crictl pull nginx -} - -{ # Step 2: Create sandbox.json -cat < ${shim_config_path} <<-EOF + runc_shim = "/usr/bin/containerd-shim" + +[runsc_config] + debug = "true" + debug-log = "/tmp/runsc-logs/" + strace = "true" + file-access = "shared" +EOF +fi + +# Configure CNI. +(cd "${GOPATH}" && src/github.com/containerd/containerd/script/setup/install-cni) +cat </dev/null; then + service docker restart +fi diff --git a/tools/vm/ubuntu1604/25_docker.sh b/tools/vm/ubuntu1604/25_docker.sh deleted file mode 100755 index 53d8ca588..000000000 --- a/tools/vm/ubuntu1604/25_docker.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -# Copyright 2019 The gVisor Authors. 
diff --git a/tools/vm/ubuntu1604/25_docker.sh b/tools/vm/ubuntu1604/25_docker.sh
deleted file mode 100755
index 53d8ca588..000000000
--- a/tools/vm/ubuntu1604/25_docker.sh
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Add dependencies.
-while true; do
-  if (apt-get update && apt-get install -y \
-      apt-transport-https \
-      ca-certificates \
-      curl \
-      gnupg-agent \
-      software-properties-common); then
-    break
-  fi
-  result=$?
-  if [[ $result -ne 100 ]]; then
-    exit $result
-  fi
-done
-
-# Install the key.
-curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
-
-# Add the repository.
-add-apt-repository \
-    "deb [arch=amd64] https://download.docker.com/linux/ubuntu \
-    $(lsb_release -cs) \
-    stable"
-
-# Install docker.
-while true; do
-  if (apt-get update && apt-get install -y \
-      docker-ce \
-      docker-ce-cli \
-      containerd.io); then
-    break
-  fi
-  result=$?
-  if [[ $result -ne 100 ]]; then
-    exit $result
-  fi
-done
-
-# Enable Docker IPv6.
-cat > /etc/docker/daemon.json < ${shim_config_tmp_path} <<-EOF
-    runc_shim = "/usr/local/bin/containerd-shim"
-
-[runsc_config]
-    debug = "true"
-    debug-log = "/tmp/runsc-logs/"
-    strace = "true"
-    file-access = "shared"
-EOF
-mv ${shim_config_tmp_path} ${shim_config_path}
-
-# Configure CNI.
-(cd "${GOPATH}" && GOPATH="${GOPATH}" \
-    src/github.com/containerd/containerd/script/setup/install-cni)
-
-# Cleanup the above.
-rm -rf "${GOPATH}"
-rm -rf "${latest}"
-rm -rf "${shim_path}"
-rm -rf "${shim_config_tmp_path}"
diff --git a/tools/vm/ubuntu1604/30_docker.sh b/tools/vm/ubuntu1604/30_docker.sh
new file mode 100755
index 000000000..53d8ca588
--- /dev/null
+++ b/tools/vm/ubuntu1604/30_docker.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Add dependencies.
+while true; do
+  if (apt-get update && apt-get install -y \
+      apt-transport-https \
+      ca-certificates \
+      curl \
+      gnupg-agent \
+      software-properties-common); then
+    break
+  fi
+  result=$?
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
+
+# Install the key.
+curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
+
+# Add the repository.
+add-apt-repository \
+    "deb [arch=amd64] https://download.docker.com/linux/ubuntu \
+    $(lsb_release -cs) \
+    stable"
+
+# Install docker.
+while true; do
+  if (apt-get update && apt-get install -y \
+      docker-ce \
+      docker-ce-cli \
+      containerd.io); then
+    break
+  fi
+  result=$?
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
+
+# Enable Docker IPv6.
+cat > /etc/docker/daemon.json <
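The `daemon.json` heredoc is truncated in both the deleted and the re-added copy of this script. A sketch of a typical Docker IPv6 configuration of the kind this step writes, followed by the restart logic the surviving fragments show — the `fixed-cidr-v6` prefix is Docker's documentation example, not the recovered file contents:

```bash
# Illustrative: enable IPv6 in the Docker daemon; the prefix is an example
# value, not the text lost from the original heredoc.
cat > /etc/docker/daemon.json <<EOF
{
    "ipv6": true,
    "fixed-cidr-v6": "2001:db8:1::/64"
}
EOF

# Restart docker to pick up the new configuration, if it is running.
if service docker status 2>/dev/null; then
  service docker restart
fi
```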
Date: Fri, 17 Jul 2020 18:26:08 -0700
Subject: Clean up html on the website.

- Fixes some html validation issues.
- Fixes links on security basics blog post.
- Adds rel=noopener to links with target=_blank and adds a check to
  htmlproofer.
- Adds favicon check to htmlproofer.

Fixes #3286
Fixes #3284

PiperOrigin-RevId: 321892602
---
 g3doc/README.md                            | 10 +++------
 images/jekyll/Dockerfile                   |  1 +
 images/jekyll/checks.rb                    | 36 ++++++++++++++++++++++++++++++
 tools/bazeldefs/defs.bzl                   |  3 +++
 tools/defs.bzl                             |  3 ++-
 website/BUILD                              |  4 +---
 website/_includes/footer.html              |  2 +-
 website/_includes/graph.html               |  2 +-
 website/_includes/header-links.html        |  2 +-
 website/_layouts/docs.html                 |  4 ++--
 website/blog/2019-11-18-security-basics.md | 23 ++++++++++++-------
 website/defs.bzl                           |  4 +++-
 12 files changed, 69 insertions(+), 25 deletions(-)
 create mode 100644 images/jekyll/checks.rb

(limited to 'g3doc')

diff --git a/g3doc/README.md b/g3doc/README.md
index 7956fe739..22bfb15f7 100644
--- a/g3doc/README.md
+++ b/g3doc/README.md
@@ -117,9 +117,7 @@ for more information on filesystem bundles. `runsc` implements
 multiple commands that perform various functions such as starting, stopping,
 listing, and querying the status of containers.
 
-### Sentry
-
-
+### Sentry {#sentry}
 
 The Sentry is the largest component of gVisor. It can be thought of as an
 application kernel. The Sentry implements all the kernel functionality needed by
@@ -136,9 +134,7 @@ calls it makes. For example, the Sentry is not able to open
 files directly; file system operations that extend beyond the sandbox (not
 internal `/proc` files, pipes, etc) are sent to the Gofer, described below.
 
-### Gofer
-
-
+### Gofer {#gofer}
 
 The Gofer is a standard host process which is started with each container and
 communicates with the Sentry via the [9P protocol][9p] over a socket or shared
 memory channel. The Sentry process is started in a restricted seccomp container
 without access to file system resources. The Gofer mediates all access to these
 resources, providing an additional level of isolation.
 
-### Application
+### Application {#application}
 
 The application is a normal Linux binary provided to gVisor in an OCI runtime
 bundle. gVisor aims to provide an environment equivalent to Linux v4.4, so
diff --git a/images/jekyll/Dockerfile b/images/jekyll/Dockerfile
index 4860dd750..ba039ba15 100644
--- a/images/jekyll/Dockerfile
+++ b/images/jekyll/Dockerfile
@@ -10,4 +10,5 @@ RUN gem install \
     jekyll-relative-links:0.6.1 \
     jekyll-feed:0.13.0 \
     jekyll-sitemap:1.4.0
+COPY checks.rb /checks.rb
 CMD ["/usr/gem/gems/jekyll-4.0.0/exe/jekyll", "build", "-t", "-s", "/input", "-d", "/output"]
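The rel=noopener check introduced in `checks.rb` below can be approximated from the command line. A rough shell equivalent, illustrative only and not part of the change:

```bash
# List anchors that open a new tab but do not declare rel=noopener. This is an
# approximation: it flags lines, not individual anchor elements.
grep -rn 'target="_blank"' website/ | grep -v 'rel="noopener"'
```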
diff --git a/images/jekyll/checks.rb b/images/jekyll/checks.rb
new file mode 100644
index 000000000..fc7e6b5a8
--- /dev/null
+++ b/images/jekyll/checks.rb
@@ -0,0 +1,36 @@
+#!/usr/local/bin/ruby
+#
+# HTMLProofer checks for the gVisor website.
+#
+require 'html-proofer'
+
+# NoOpenerCheck checks to make sure links with target=_blank include the
+# rel=noopener attribute.
+class NoOpenerCheck < ::HTMLProofer::Check
+  def run
+    @html.css('a').each do |node|
+      link = create_element(node)
+      line = node.line
+
+      rel = link.respond_to?(:rel) ? link.rel.split(' ') : []
+
+      if link.respond_to?(:target) && link.target == "_blank" && !rel.include?("noopener")
+        return add_issue("You should set rel=noopener for links with target=_blank", line: line)
+      end
+    end
+  end
+end
+
+def main()
+  options = {
+    :check_html => true,
+    :check_favicon => true,
+    :disable_external => true,
+  }
+
+  HTMLProofer.check_directories(ARGV, options).run
+end
+
+if __FILE__ == $0
+  main
+end
diff --git a/tools/bazeldefs/defs.bzl b/tools/bazeldefs/defs.bzl
index 620c460de..3db8e13d0 100644
--- a/tools/bazeldefs/defs.bzl
+++ b/tools/bazeldefs/defs.bzl
@@ -32,6 +32,9 @@ rbe_platform = native.platform
 rbe_toolchain = native.toolchain
 vdso_linker_option = "-fuse-ld=gold "
 
+def short_path(path):
+    return path
+
 def proto_library(name, has_services = None, **kwargs):
     native.proto_library(
         name = name,
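With `checks.rb` baked into the Jekyll image, the website genrule (below) runs it over the generated site. Reproducing the same checks locally might look like the following, assuming a built `_site` directory; the bind-mount path is illustrative:

```bash
# Run the HTMLProofer checks against a locally built site using the same image
# and entry point the genrule uses.
docker run -i --user $(id -u):$(id -g) \
    -v "$(pwd)/_site:/output" \
    gvisor.dev/images/jekyll \
    ruby /checks.rb /output
```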
"nogo_test") @@ -38,6 +38,7 @@ py_requirement = _py_requirement py_test = _py_test select_arch = _select_arch select_system = _select_system +short_path = _short_path rbe_platform = _rbe_platform rbe_toolchain = _rbe_toolchain vdso_linker_option = _vdso_linker_option diff --git a/website/BUILD b/website/BUILD index 4488cb543..10e0299ae 100644 --- a/website/BUILD +++ b/website/BUILD @@ -55,9 +55,7 @@ genrule( "docker run -i --user $$(id -u):$$(id -g) " + "-v $$(readlink -m $$T/output/_site):/output " + "gvisor.dev/images/jekyll " + - "/usr/gem/bin/htmlproofer " + - "--disable-external " + - "--check-html " + + "ruby /checks.rb " + "/output && " + "cp $(location //website/cmd/server) $$T/output/server && " + "tar -zcf $@ -C $$T/output . && " + diff --git a/website/_includes/footer.html b/website/_includes/footer.html index 9cc8176f7..c1a373329 100644 --- a/website/_includes/footer.html +++ b/website/_includes/footer.html @@ -8,7 +8,7 @@ {% if site.analytics %} -