diff options
103 files changed, 2389 insertions, 682 deletions
@@ -129,10 +129,9 @@ tests: unit-tests @$(call submake,test TARGETS="test/syscalls/...") .PHONY: tests - integration-tests: ## Run all standard integration tests. integration-tests: docker-tests overlay-tests hostnet-tests swgso-tests -integration-tests: do-tests kvm-tests root-tests containerd-tests +integration-tests: do-tests kvm-tests containerd-test-1.3.4 .PHONY: integration-tests network-tests: ## Run all networking integration tests. @@ -186,6 +185,7 @@ swgso-tests: load-basic-images @$(call submake,install-test-runtime RUNTIME="swgso" ARGS="--software-gso=true --gso=false") @$(call submake,test-runtime RUNTIME="swgso" TARGETS="$(INTEGRATION_TARGETS)") .PHONY: swgso-tests + hostnet-tests: load-basic-images @$(call submake,install-test-runtime RUNTIME="hostnet" ARGS="--network=host") @$(call submake,test-runtime RUNTIME="hostnet" OPTIONS="--test_arg=-checkpoint=false" TARGETS="$(INTEGRATION_TARGETS)") @@ -200,6 +200,8 @@ kvm-tests: load-basic-images .PHONY: kvm-tests iptables-tests: load-iptables + @sudo modprobe iptable_filter + @sudo modprobe ip6table_filter @$(call submake,test-runtime RUNTIME="runc" TARGETS="//test/iptables:iptables_test") @$(call submake,install-test-runtime RUNTIME="iptables" ARGS="--net-raw") @$(call submake,test-runtime RUNTIME="iptables" TARGETS="//test/iptables:iptables_test") @@ -217,16 +219,12 @@ packetimpact-tests: load-packetimpact @$(call submake,test-runtime OPTIONS="--jobs=HOST_CPUS*3 --local_test_jobs=HOST_CPUS*3" RUNTIME="packetimpact" TARGETS="$(shell $(MAKE) query TARGETS='attr(tags, packetimpact, tests(//...))')") .PHONY: packetimpact-tests -root-tests: load-basic-images - @$(call submake,install-test-runtime) - @$(call submake,sudo TARGETS="//test/root:root_test" ARGS="-test.v") -.PHONY: root-tests - # Specific containerd version tests. -containerd-test-%: load-basic_alpine load-basic_python load-basic_busybox load-basic_resolv load-basic_httpd install-test-runtime +containerd-test-%: load-basic_alpine load-basic_python load-basic_busybox load-basic_resolv load-basic_httpd load-basic_ubuntu + @$(call submake,install-test-runtime RUNTIME="root") @CONTAINERD_VERSION=$* $(MAKE) sudo TARGETS="tools/installers:containerd" @$(MAKE) sudo TARGETS="tools/installers:shim" - @$(MAKE) sudo TARGETS="test/root:root_test" ARGS="-test.v" + @$(MAKE) sudo TARGETS="test/root:root_test" ARGS="--runtime=root -test.v" # Note that we can't run containerd-test-1.1.8 tests here. # diff --git a/g3doc/user_guide/FAQ.md b/g3doc/user_guide/FAQ.md index 8b5cff0ab..514fe3918 100644 --- a/g3doc/user_guide/FAQ.md +++ b/g3doc/user_guide/FAQ.md @@ -74,11 +74,10 @@ directories. ### I'm getting an error like: `panic: unable to attach: operation not permitted` or `fork/exec /proc/self/exe: invalid argument: unknown` {#runsc-perms} -Make sure that permissions and the owner is correct on the `runsc` binary. +Make sure that permissions is correct on the `runsc` binary. ```bash -sudo chown root:root /usr/local/bin/runsc -sudo chmod 0755 /usr/local/bin/runsc +sudo chmod a+rx /usr/local/bin/runsc ``` ### I'm getting an error like `mount submount "/etc/hostname": creating mount with source ".../hostname": input/output error: unknown.` {#memlock} diff --git a/g3doc/user_guide/install.md b/g3doc/user_guide/install.md index 9afdd264d..abb9e8582 100644 --- a/g3doc/user_guide/install.md +++ b/g3doc/user_guide/install.md @@ -5,6 +5,68 @@ > Note: gVisor supports only x86\_64 and requires Linux 4.14.77+ > ([older Linux](./networking.md#gso)). +## Install latest release {#install-latest} + +To download and install the latest release manually follow these steps: + +```bash +( + set -e + URL=https://storage.googleapis.com/gvisor/releases/release/latest + wget ${URL}/runsc ${URL}/runsc.sha512 + sha512sum -c runsc.sha512 + rm -f runsc.sha512 + sudo mv runsc /usr/local/bin + sudo chmod a+rx /usr/local/bin/runsc +) +``` + +To install gVisor with Docker, run the following commands: + +```bash +/usr/local/bin/runsc install +sudo systemctl restart docker +docker run --rm --runtime=runsc hello-world +``` + +For more details about using gVisor with Docker, see +[Docker Quick Start](./quick_start/docker.md) + +Note: It is important to copy `runsc` to a location that is readable and +executable to all users, since `runsc` executes itself as user `nobody` to avoid +unnecessary privileges. The `/usr/local/bin` directory is a good place to put +the `runsc` binary. + +## Install from an `apt` repository + +First, appropriate dependencies must be installed to allow `apt` to install +packages via https: + +```bash +sudo apt-get update && \ +sudo apt-get install -y \ + apt-transport-https \ + ca-certificates \ + curl \ + gnupg-agent \ + software-properties-common +``` + +Next, the configure the key used to sign archives and the repository: + +```bash +curl -fsSL https://gvisor.dev/archive.key | sudo apt-key add - +sudo add-apt-repository "deb https://storage.googleapis.com/gvisor/releases release main" +``` + +Now the runsc package can be installed: + +```bash +sudo apt-get update && sudo apt-get install -y runsc +``` + +If you have Docker installed, it will be automatically configured. + ## Versions The `runsc` binaries and repositories are available in multiple versions and @@ -21,12 +83,16 @@ Binaries are available for every commit on the `master` branch, and are available at the following URL: `https://storage.googleapis.com/gvisor/releases/master/latest/runsc` +`https://storage.googleapis.com/gvisor/releases/master/latest/runsc.sha512` -Checksums for the release binary are at: +You can use this link with the steps described in +[Install latest release](#install-latest). -`https://storage.googleapis.com/gvisor/releases/master/latest/runsc.sha512` +For `apt` installation, use the `master` to configure the repository: -For `apt` installation, use the `master` as the `${DIST}` below. +```bash +sudo add-apt-repository "deb https://storage.googleapis.com/gvisor/releases master main" +``` ### Nightly @@ -34,18 +100,22 @@ Nightly releases are built most nights from the master branch, and are available at the following URL: `https://storage.googleapis.com/gvisor/releases/nightly/latest/runsc` - -Checksums for the release binary are at: - `https://storage.googleapis.com/gvisor/releases/nightly/latest/runsc.sha512` +You can use this link with the steps described in +[Install latest release](#install-latest). + Specific nightly releases can be found at: `https://storage.googleapis.com/gvisor/releases/nightly/${yyyy-mm-dd}/runsc` Note that a release may not be available for every day. -For `apt` installation, use the `nightly` as the `${DIST}` below. +For `apt` installation, use the `nightly` to configure the repository: + +```bash +sudo add-apt-repository "deb https://storage.googleapis.com/gvisor/releases nightly main" +``` ### Latest release @@ -53,105 +123,47 @@ The latest official release is available at the following URL: `https://storage.googleapis.com/gvisor/releases/release/latest` -For `apt` installation, use the `release` as the `${DIST}` below. - -### Specific release - -A given release release is available at the following URL: - -`https://storage.googleapis.com/gvisor/releases/release/${yyyymmdd}` - -See the [releases][releases] page for information about specific releases. - -For `apt` installation of a specific release, which may include point updates, -use the date of the release, e.g. `${yyyymmdd}`, as the `${DIST}` below. - -> Note: only newer releases may be available as `apt` repositories. - -### Point release - -A given point release is available at the following URL: - -`https://storage.googleapis.com/gvisor/releases/release/${yyyymmdd}.${rc}` - -Note that `apt` installation of a specific point release is not supported. - -## Install from an `apt` repository +You can use this link with the steps described in +[Install latest release](#install-latest). -First, appropriate dependencies must be installed to allow `apt` to install -packages via https: +For `apt` installation, use the `release` to configure the repository: ```bash -sudo apt-get update && \ -sudo apt-get install -y \ - apt-transport-https \ - ca-certificates \ - curl \ - gnupg-agent \ - software-properties-common +sudo add-apt-repository "deb https://storage.googleapis.com/gvisor/releases release main" ``` -Next, the key used to sign archives should be added to your `apt` keychain: - -```bash -curl -fsSL https://gvisor.dev/archive.key | sudo apt-key add - -``` +### Specific release -Based on the release type, you will need to substitute `${DIST}` below, using -one of: +A given release release is available at the following URL: -* `master`: For HEAD. -* `nightly`: For nightly releases. -* `release`: For the latest release. -* `${yyyymmdd}`: For a specific releases (see above). +`https://storage.googleapis.com/gvisor/releases/release/${yyyymmdd}` -The repository for the release you wish to install should be added: +You can use this link with the steps described in +[Install latest release](#install-latest). -```bash -sudo add-apt-repository "deb https://storage.googleapis.com/gvisor/releases ${DIST} main" -``` +See the [releases](https://github.com/google/gvisor/releases) page for +information about specific releases. -For example, to install the latest official release, you can use: +For `apt` installation of a specific release, which may include point updates, +use the date of the release for repository, e.g. `${yyyymmdd}`. ```bash -sudo add-apt-repository "deb https://storage.googleapis.com/gvisor/releases release main" +sudo add-apt-repository "deb https://storage.googleapis.com/gvisor/releases yyyymmdd main" ``` -Now the runsc package can be installed: - -```bash -sudo apt-get update && sudo apt-get install -y runsc -``` +> Note: only newer releases may be available as `apt` repositories. -If you have Docker installed, it will be automatically configured. +### Point release -## Install directly +A given point release is available at the following URL: -The binary URLs provided above can be used to install directly. For example, the -latest nightly binary can be downloaded, validated, and placed in an appropriate -location by running: +`https://storage.googleapis.com/gvisor/releases/release/${yyyymmdd}.${rc}` -```bash -( - set -e - URL=https://storage.googleapis.com/gvisor/releases/nightly/latest - wget ${URL}/runsc - wget ${URL}/runsc.sha512 - sha512sum -c runsc.sha512 - rm -f runsc.sha512 - sudo mv runsc /usr/local/bin - sudo chown root:root /usr/local/bin/runsc - sudo chmod 0755 /usr/local/bin/runsc -) -``` +You can use this link with the steps described in +[Install latest release](#install-latest). -**It is important to copy this binary to a location that is accessible to all -users, and ensure it is executable by all users**, since `runsc` executes itself -as user `nobody` to avoid unnecessary privileges. The `/usr/local/bin` directory -is a good place to put the `runsc` binary. +Note that `apt` installation of a specific point release is not supported. After installation, try out `runsc` by following the [Docker Quick Start](./quick_start/docker.md) or [OCI Quick Start](./quick_start/oci.md). - -[releases]: https://github.com/google/gvisor/releases diff --git a/g3doc/user_guide/quick_start/docker.md b/g3doc/user_guide/quick_start/docker.md index 6ad594ecc..ee842e453 100644 --- a/g3doc/user_guide/quick_start/docker.md +++ b/g3doc/user_guide/quick_start/docker.md @@ -22,18 +22,6 @@ named "runsc" by default. sudo runsc install ``` -You may also wish to install a runtime entry for debugging. The `runsc install` -command can accept options that will be passed to the runtime when it is invoked -by Docker. - -```bash -sudo runsc install --runtime runsc-debug -- \ - --debug \ - --debug-log=/tmp/runsc-debug.log \ - --strace \ - --log-packets -``` - You must restart the Docker daemon after installing the runtime. Typically this is done via `systemd`: @@ -85,6 +73,21 @@ $ docker run --runtime=runsc -it ubuntu dmesg Note that this is easily replicated by an attacker so applications should never use `dmesg` to verify the runtime in a security sensitive context. +## Options + +You may also wish to install a runtime entry with different options. The `runsc +install` command can accept flags that will be passed to the runtime when it is +invoked by Docker. For example, to install a runtime with debugging enabled, run +the following: + +```bash +sudo runsc install --runtime runsc-debug -- \ + --debug \ + --debug-log=/tmp/runsc-debug.log \ + --strace \ + --log-packets +``` + Next, look at the different options available for gVisor: [platform][platforms], [network][networking], [filesystem][filesystem]. diff --git a/images/jekyll/build.sh b/images/jekyll/build.sh index bfceb2781..010972ea6 100755 --- a/images/jekyll/build.sh +++ b/images/jekyll/build.sh @@ -18,4 +18,5 @@ set -euxo pipefail # Generate the syntax highlighting css file. /usr/gem/bin/rougify style github >/input/_sass/syntax.css -/usr/gem/bin/jekyll build -t -s /input -d /output +# Build website including pages irrespective of date. +/usr/gem/bin/jekyll build --future -t -s /input -d /output diff --git a/pkg/sentry/devices/memdev/BUILD b/pkg/sentry/devices/memdev/BUILD index abe58f818..4c8604d58 100644 --- a/pkg/sentry/devices/memdev/BUILD +++ b/pkg/sentry/devices/memdev/BUILD @@ -18,9 +18,10 @@ go_library( "//pkg/rand", "//pkg/safemem", "//pkg/sentry/fsimpl/devtmpfs", + "//pkg/sentry/fsimpl/tmpfs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", - "//pkg/sentry/mm", - "//pkg/sentry/pgalloc", "//pkg/sentry/vfs", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/devices/memdev/zero.go b/pkg/sentry/devices/memdev/zero.go index 2e631a252..60cfea888 100644 --- a/pkg/sentry/devices/memdev/zero.go +++ b/pkg/sentry/devices/memdev/zero.go @@ -16,9 +16,10 @@ package memdev import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) @@ -79,11 +80,22 @@ func (fd *zeroFD) Seek(ctx context.Context, offset int64, whence int32) (int64, // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *zeroFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { - m, err := mm.NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx)) + if opts.Private || !opts.MaxPerms.Write { + // This mapping will never permit writing to the "underlying file" (in + // Linux terms, it isn't VM_SHARED), so implement it as an anonymous + // mapping, but back it with fd; this is what Linux does, and is + // actually application-visible because the resulting VMA will show up + // in /proc/[pid]/maps with fd.vfsfd.VirtualDentry()'s path rather than + // "/dev/zero (deleted)". + opts.Offset = 0 + opts.MappingIdentity = &fd.vfsfd + opts.MappingIdentity.IncRef() + return nil + } + tmpfsFD, err := tmpfs.NewZeroFile(ctx, auth.CredentialsFromContext(ctx), kernel.KernelFromContext(ctx).ShmMount(), opts.Length) if err != nil { return err } - opts.MappingIdentity = m - opts.Mappable = m - return nil + defer tmpfsFD.DecRef(ctx) + return tmpfsFD.ConfigureMMap(ctx, opts) } diff --git a/pkg/sentry/fs/g3doc/fuse.md b/pkg/sentry/fs/g3doc/fuse.md index 8f5e45005..05e043583 100644 --- a/pkg/sentry/fs/g3doc/fuse.md +++ b/pkg/sentry/fs/g3doc/fuse.md @@ -143,8 +143,8 @@ ops can be implemented in parallel. #### Basic I/O Implementation -Currently we have implemented basic functionalities of read and write -for our FUSE. We describe the design and ways to improve it here: +Currently we have implemented basic functionalities of read and write for our +FUSE. We describe the design and ways to improve it here: ##### Basic FUSE Read @@ -153,38 +153,37 @@ The vfs2 expects implementations of `vfs.FileDescriptionImpl.Read()` and reach our implementation of those interface functions located at `pkg/sentry/fsimpl/fuse/regular_file.go` for regular files. -After validation checks of the input, sentry sends `FUSE_READ` requests -to the FUSE daemon. The FUSE daemon returns data after the `fuse_out_header` -as the responses. For the first version, we create a copy in kernel memory -of those data. They are represented as a byte slice in the marshalled -struct. This happens as a common process for all the FUSE responses at this -moment at `pkg/sentry/fsimpl/fuse/dev.go:writeLocked()`. We then directly -copy from this intermediate buffer to the input buffer -provided by the read syscall. - -There is an extra requirement for FUSE: When mounting the FUSE fs, -the mounter or the FUSE daemon can specify a `max_read` or a `max_pages` -parameter. They are the upperbound of the bytes to read in each -`FUSE_READ` request. We implemented the code to handle the fragmented reads. - -To improve the performance: ideally we should have buffer cache to copy -those data from the responses of FUSE daemon into, as is also the -design of several other existing file system implementations for -sentry, instead of a single-use temporary buffer. Directly mapping -the memory of one process to another could also boost the performance, -but to keep them isolated, we did not choose to do so. +After validation checks of the input, sentry sends `FUSE_READ` requests to the +FUSE daemon. The FUSE daemon returns data after the `fuse_out_header` as the +responses. For the first version, we create a copy in kernel memory of those +data. They are represented as a byte slice in the marshalled struct. This +happens as a common process for all the FUSE responses at this moment at +`pkg/sentry/fsimpl/fuse/dev.go:writeLocked()`. We then directly copy from this +intermediate buffer to the input buffer provided by the read syscall. + +There is an extra requirement for FUSE: When mounting the FUSE fs, the mounter +or the FUSE daemon can specify a `max_read` or a `max_pages` parameter. They are +the upperbound of the bytes to read in each `FUSE_READ` request. We implemented +the code to handle the fragmented reads. + +To improve the performance: ideally we should have buffer cache to copy those +data from the responses of FUSE daemon into, as is also the design of several +other existing file system implementations for sentry, instead of a single-use +temporary buffer. Directly mapping the memory of one process to another could +also boost the performance, but to keep them isolated, we did not choose to do +so. ##### Basic FUSE Write -The vfs2 invokes implementations of `vfs.FileDescriptionImpl.Write()` -and `vfs.FileDescriptionImpl.PWrite()` on the regular file descriptor -of FUSE when a user makes write(2) and pwrite(2) syscall. +The vfs2 invokes implementations of `vfs.FileDescriptionImpl.Write()` and +`vfs.FileDescriptionImpl.PWrite()` on the regular file descriptor of FUSE when a +user makes write(2) and pwrite(2) syscall. -For valid writes, sentry sends the bytes to write after a `FUSE_WRITE` -header (can be regarded as a request with 2 payloads) to the FUSE daemon. -For the first version, we allocate a buffer inside kernel memory to store -the bytes from the user, and copy directly from that buffer to the memory -of FUSE daemon. This happens at `pkg/sentry/fsimpl/fuse/dev.go:readLocked()` +For valid writes, sentry sends the bytes to write after a `FUSE_WRITE` header +(can be regarded as a request with 2 payloads) to the FUSE daemon. For the first +version, we allocate a buffer inside kernel memory to store the bytes from the +user, and copy directly from that buffer to the memory of FUSE daemon. This +happens at `pkg/sentry/fsimpl/fuse/dev.go:readLocked()` The parameters `max_write` and `max_pages` restrict the number of bytes in one `FUSE_WRITE`. There are code handling fragmented writes in current diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index dcf1ee25b..f0f2e0be7 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -237,7 +237,7 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, return offset, nil } -// DecRef implements kernfs.Inode. +// DecRef implements kernfs.Inode.DecRef. func (i *rootInode) DecRef(context.Context) { i.rootInodeRefs.DecRef(i.Destroy) } diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go index 812171fa3..bb0bf3a07 100644 --- a/pkg/sentry/fsimpl/eventfd/eventfd.go +++ b/pkg/sentry/fsimpl/eventfd/eventfd.go @@ -30,7 +30,7 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// EventFileDescription implements FileDescriptionImpl for file-based event +// EventFileDescription implements vfs.FileDescriptionImpl for file-based event // notification (eventfd). Eventfds are usually internal to the Sentry but in // certain situations they may be converted into a host-backed eventfd. type EventFileDescription struct { @@ -106,7 +106,7 @@ func (efd *EventFileDescription) HostFD() (int, error) { return efd.hostfd, nil } -// Release implements FileDescriptionImpl.Release() +// Release implements vfs.FileDescriptionImpl.Release. func (efd *EventFileDescription) Release(context.Context) { efd.mu.Lock() defer efd.mu.Unlock() @@ -119,7 +119,7 @@ func (efd *EventFileDescription) Release(context.Context) { } } -// Read implements FileDescriptionImpl.Read. +// Read implements vfs.FileDescriptionImpl.Read. func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { if dst.NumBytes() < 8 { return 0, syscall.EINVAL @@ -130,7 +130,7 @@ func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequenc return 8, nil } -// Write implements FileDescriptionImpl.Write. +// Write implements vfs.FileDescriptionImpl.Write. func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { if src.NumBytes() < 8 { return 0, syscall.EINVAL diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 075de0e22..a4a6d8c55 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -490,7 +490,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return syserror.EROFS } -// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { _, inode, err := fs.walk(ctx, rp, false) if err != nil { diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go index 2fd0d1fa8..f33592d59 100644 --- a/pkg/sentry/fsimpl/ext/symlink.go +++ b/pkg/sentry/fsimpl/ext/symlink.go @@ -61,7 +61,7 @@ func (in *inode) isSymlink() bool { return ok } -// symlinkFD represents a symlink file description and implements implements +// symlinkFD represents a symlink file description and implements // vfs.FileDescriptionImpl. which may only be used if open options contains // O_PATH. For this reason most of the functions return EBADF. type symlinkFD struct { diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go index 5539466ff..f690ef5ad 100644 --- a/pkg/sentry/fsimpl/fuse/dev.go +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -95,9 +95,14 @@ type DeviceFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *DeviceFD) Release(context.Context) { +func (fd *DeviceFD) Release(ctx context.Context) { if fd.fs != nil { + fd.fs.conn.mu.Lock() fd.fs.conn.connected = false + fd.fs.conn.mu.Unlock() + + fd.fs.VFSFilesystem().DecRef(ctx) + fd.fs = nil } } diff --git a/pkg/sentry/fsimpl/fuse/directory.go b/pkg/sentry/fsimpl/fuse/directory.go index ee77dbaa3..8f220a04b 100644 --- a/pkg/sentry/fsimpl/fuse/directory.go +++ b/pkg/sentry/fsimpl/fuse/directory.go @@ -35,27 +35,27 @@ func (*directoryFD) Allocate(ctx context.Context, mode, offset, length uint64) e return syserror.EISDIR } -// PRead implements FileDescriptionImpl.PRead. +// PRead implements vfs.FileDescriptionImpl.PRead. func (*directoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return 0, syserror.EISDIR } -// Read implements FileDescriptionImpl.Read. +// Read implements vfs.FileDescriptionImpl.Read. func (*directoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { return 0, syserror.EISDIR } -// PWrite implements FileDescriptionImpl.PWrite. +// PWrite implements vfs.FileDescriptionImpl.PWrite. func (*directoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { return 0, syserror.EISDIR } -// Write implements FileDescriptionImpl.Write. +// Write implements vfs.FileDescriptionImpl.Write. func (*directoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { return 0, syserror.EISDIR } -// IterDirents implements FileDescriptionImpl.IterDirents. +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. func (dir *directoryFD) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback) error { fusefs := dir.inode().fs task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx) diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index f1ffd2343..b3573f80d 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -218,6 +218,7 @@ func newFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOpt conn: conn, } + fs.VFSFilesystem().IncRef() fuseFD.fs = fs return fs, nil @@ -734,7 +735,7 @@ func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptio return statFromFUSEAttr(attr, opts.Mask, i.fs.devMinor), nil } -// DecRef implements kernfs.Inode. +// DecRef implements kernfs.Inode.DecRef. func (i *inode) DecRef(context.Context) { i.inodeRefs.DecRef(i.Destroy) } diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 4a592175b..97b9165cc 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -1311,6 +1311,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if !renamed.isDir() { return syserror.EISDIR } + if genericIsAncestorDentry(replaced, renamed) { + return syserror.ENOTEMPTY + } } else { if rp.MustBeDir() || renamed.isDir() { return syserror.ENOTDIR @@ -1361,14 +1364,15 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // with reference counts and queue oldParent for checkCachingLocked if the // parent isn't actually changing. if oldParent != newParent { + oldParent.decRefLocked() ds = appendDentry(ds, oldParent) newParent.IncRef() if renamed.isSynthetic() { oldParent.syntheticChildren-- newParent.syntheticChildren++ } + renamed.parent = newParent } - renamed.parent = newParent renamed.name = newName if newParent.children == nil { newParent.children = make(map[string]*dentry) @@ -1491,7 +1495,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return fs.unlinkAt(ctx, rp, false /* dir */) } -// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { var ds *[]*dentry fs.renameMu.RLock() diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index c4551562a..db8536f26 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -139,12 +139,12 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs // filesystemType implements vfs.FilesystemType. type filesystemType struct{} -// GetFilesystem implements FilesystemType.GetFilesystem. +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { panic("host.filesystemType.GetFilesystem should never be called") } -// Name implements FilesystemType.Name. +// Name implements vfs.FilesystemType.Name. func (filesystemType) Name() string { return "none" } @@ -243,7 +243,7 @@ type inode struct { pf inodePlatformFile } -// CheckPermissions implements kernfs.Inode. +// CheckPermissions implements kernfs.Inode.CheckPermissions. func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { var s syscall.Stat_t if err := syscall.Fstat(i.hostFD, &s); err != nil { @@ -252,7 +252,7 @@ func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, a return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid)) } -// Mode implements kernfs.Inode. +// Mode implements kernfs.Inode.Mode. func (i *inode) Mode() linux.FileMode { var s syscall.Stat_t if err := syscall.Fstat(i.hostFD, &s); err != nil { @@ -263,7 +263,7 @@ func (i *inode) Mode() linux.FileMode { return linux.FileMode(s.Mode) } -// Stat implements kernfs.Inode. +// Stat implements kernfs.Inode.Stat. func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { if opts.Mask&linux.STATX__RESERVED != 0 { return linux.Statx{}, syserror.EINVAL @@ -376,7 +376,7 @@ func (i *inode) fstat(fs *filesystem) (linux.Statx, error) { }, nil } -// SetStat implements kernfs.Inode. +// SetStat implements kernfs.Inode.SetStat. func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { s := &opts.Stat @@ -435,17 +435,17 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre return nil } -// IncRef implements kernfs.Inode. +// IncRef implements kernfs.Inode.IncRef. func (i *inode) IncRef() { i.refs.IncRef() } -// TryIncRef implements kernfs.Inode. +// TryIncRef implements kernfs.Inode.TryIncRef. func (i *inode) TryIncRef() bool { return i.refs.TryIncRef() } -// DecRef implements kernfs.Inode. +// DecRef implements kernfs.Inode.DecRef. func (i *inode) DecRef(ctx context.Context) { i.refs.DecRef(func() { if i.wouldBlock { @@ -457,7 +457,7 @@ func (i *inode) DecRef(ctx context.Context) { }) } -// Open implements kernfs.Inode. +// Open implements kernfs.Inode.Open. func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { // Once created, we cannot re-open a socket fd through /proc/[pid]/fd/. if i.Mode().FileType() == linux.S_IFSOCK { @@ -542,28 +542,28 @@ type fileDescription struct { offset int64 } -// SetStat implements vfs.FileDescriptionImpl. +// SetStat implements vfs.FileDescriptionImpl.SetStat. func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) return f.inode.SetStat(ctx, f.vfsfd.Mount().Filesystem(), creds, opts) } -// Stat implements vfs.FileDescriptionImpl. +// Stat implements vfs.FileDescriptionImpl.Stat. func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { return f.inode.Stat(ctx, f.vfsfd.Mount().Filesystem(), opts) } -// Release implements vfs.FileDescriptionImpl. +// Release implements vfs.FileDescriptionImpl.Release. func (f *fileDescription) Release(context.Context) { // noop } -// Allocate implements vfs.FileDescriptionImpl. +// Allocate implements vfs.FileDescriptionImpl.Allocate. func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error { return unix.Fallocate(f.inode.hostFD, uint32(mode), int64(offset), int64(length)) } -// PRead implements FileDescriptionImpl. +// PRead implements vfs.FileDescriptionImpl.PRead. func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { i := f.inode if !i.seekable { @@ -573,7 +573,7 @@ func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, off return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags) } -// Read implements FileDescriptionImpl. +// Read implements vfs.FileDescriptionImpl.Read. func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { i := f.inode if !i.seekable { @@ -610,7 +610,7 @@ func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, off return int64(n), err } -// PWrite implements FileDescriptionImpl. +// PWrite implements vfs.FileDescriptionImpl.PWrite. func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { if !f.inode.seekable { return 0, syserror.ESPIPE @@ -619,7 +619,7 @@ func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, of return f.writeToHostFD(ctx, src, offset, opts.Flags) } -// Write implements FileDescriptionImpl. +// Write implements vfs.FileDescriptionImpl.Write. func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { i := f.inode if !i.seekable { @@ -667,7 +667,7 @@ func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSeque return int64(n), err } -// Seek implements FileDescriptionImpl. +// Seek implements vfs.FileDescriptionImpl.Seek. // // Note that we do not support seeking on directories, since we do not even // allow directory fds to be imported at all. @@ -732,13 +732,13 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i return f.offset, nil } -// Sync implements FileDescriptionImpl. +// Sync implements vfs.FileDescriptionImpl.Sync. func (f *fileDescription) Sync(context.Context) error { // TODO(gvisor.dev/issue/1897): Currently, we always sync everything. return unix.Fsync(f.inode.hostFD) } -// ConfigureMMap implements FileDescriptionImpl. +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error { if !f.inode.canMap { return syserror.ENODEV diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go index 97cefa350..e02b9b8f6 100644 --- a/pkg/sentry/fsimpl/host/tty.go +++ b/pkg/sentry/fsimpl/host/tty.go @@ -76,7 +76,7 @@ func (t *TTYFileDescription) Release(ctx context.Context) { t.fileDescription.Release(ctx) } -// PRead implements vfs.FileDescriptionImpl. +// PRead implements vfs.FileDescriptionImpl.PRead. // // Reading from a TTY is only allowed for foreground process groups. Background // process groups will either get EIO or a SIGTTIN. @@ -94,7 +94,7 @@ func (t *TTYFileDescription) PRead(ctx context.Context, dst usermem.IOSequence, return t.fileDescription.PRead(ctx, dst, offset, opts) } -// Read implements vfs.FileDescriptionImpl. +// Read implements vfs.FileDescriptionImpl.Read. // // Reading from a TTY is only allowed for foreground process groups. Background // process groups will either get EIO or a SIGTTIN. @@ -112,7 +112,7 @@ func (t *TTYFileDescription) Read(ctx context.Context, dst usermem.IOSequence, o return t.fileDescription.Read(ctx, dst, opts) } -// PWrite implements vfs.FileDescriptionImpl. +// PWrite implements vfs.FileDescriptionImpl.PWrite. func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { t.mu.Lock() defer t.mu.Unlock() @@ -127,7 +127,7 @@ func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence, return t.fileDescription.PWrite(ctx, src, offset, opts) } -// Write implements vfs.FileDescriptionImpl. +// Write implements vfs.FileDescriptionImpl.Write. func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { t.mu.Lock() defer t.mu.Unlock() @@ -142,7 +142,7 @@ func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence, return t.fileDescription.Write(ctx, src, opts) } -// Ioctl implements vfs.FileDescriptionImpl. +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { task := kernel.TaskFromContext(ctx) if task == nil { diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index c659436ac..89ed265dc 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -801,7 +801,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return nil } -// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { fs.mu.RLock() _, inode, err := fs.walkExistingLocked(ctx, rp) diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index c2109cf76..6ee353ace 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -594,7 +594,7 @@ func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint3 s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeDirectory|perm) } -// Open implements kernfs.Inode. +// Open implements kernfs.Inode.Open. func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &s.locks, &opts, s.fdOpts) if err != nil { @@ -608,7 +608,7 @@ func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credenti return syserror.EPERM } -// DecRef implements kernfs.Inode. +// DecRef implements kernfs.Inode.DecRef. func (s *StaticDirectory) DecRef(context.Context) { s.StaticDirectoryRefs.DecRef(s.Destroy) } @@ -616,7 +616,7 @@ func (s *StaticDirectory) DecRef(context.Context) { // AlwaysValid partially implements kernfs.inodeDynamicLookup. type AlwaysValid struct{} -// Valid implements kernfs.inodeDynamicLookup. +// Valid implements kernfs.inodeDynamicLookup.Valid. func (*AlwaysValid) Valid(context.Context) bool { return true } diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index a9812fcef..443121c99 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -51,7 +51,7 @@ func (s *StaticSymlink) Init(creds *auth.Credentials, devMajor uint32, devMinor s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeSymlink|0777) } -// Readlink implements Inode. +// Readlink implements Inode.Readlink. func (s *StaticSymlink) Readlink(_ context.Context, _ *vfs.Mount) (string, error) { return s.target, nil } diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go index 360b77ef6..73b126669 100644 --- a/pkg/sentry/fsimpl/overlay/copy_up.go +++ b/pkg/sentry/fsimpl/overlay/copy_up.go @@ -92,7 +92,7 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { err = vfsObj.UnlinkAt(ctx, d.fs.creds, &newpop) } if err != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err)) } if d.upperVD.Ok() { d.upperVD.DecRef(ctx) @@ -362,7 +362,7 @@ func (d *dentry) copyXattrsLocked(ctx context.Context) error { // There are no guarantees as to the contents of lowerXattrs. return nil } - ctx.Warningf("failed to copy up xattrs because ListXattrAt failed: %v", err) + ctx.Infof("failed to copy up xattrs because ListXattrAt failed: %v", err) return err } @@ -374,14 +374,48 @@ func (d *dentry) copyXattrsLocked(ctx context.Context) error { value, err := vfsObj.GetXattrAt(ctx, d.fs.creds, lowerPop, &vfs.GetXattrOptions{Name: name, Size: 0}) if err != nil { - ctx.Warningf("failed to copy up xattrs because GetXattrAt failed: %v", err) + ctx.Infof("failed to copy up xattrs because GetXattrAt failed: %v", err) return err } if err := vfsObj.SetXattrAt(ctx, d.fs.creds, upperPop, &vfs.SetXattrOptions{Name: name, Value: value}); err != nil { - ctx.Warningf("failed to copy up xattrs because SetXattrAt failed: %v", err) + ctx.Infof("failed to copy up xattrs because SetXattrAt failed: %v", err) return err } } return nil } + +// copyUpDescendantsLocked ensures that all descendants of d are copied up. +// +// Preconditions: +// * filesystem.renameMu must be locked. +// * d.dirMu must be locked. +// * d.isDir(). +func (d *dentry) copyUpDescendantsLocked(ctx context.Context, ds **[]*dentry) error { + dirents, err := d.getDirentsLocked(ctx) + if err != nil { + return err + } + for _, dirent := range dirents { + if dirent.Name == "." || dirent.Name == ".." { + continue + } + child, err := d.fs.getChildLocked(ctx, d, dirent.Name, ds) + if err != nil { + return err + } + if err := child.copyUpLocked(ctx); err != nil { + return err + } + if child.isDir() { + child.dirMu.Lock() + err := child.copyUpDescendantsLocked(ctx, ds) + child.dirMu.Unlock() + if err != nil { + return err + } + } + } + return nil +} diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go index b1b292e83..7ab42e71e 100644 --- a/pkg/sentry/fsimpl/overlay/directory.go +++ b/pkg/sentry/fsimpl/overlay/directory.go @@ -143,7 +143,14 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { defer d.fs.renameMu.RUnlock() d.dirMu.Lock() defer d.dirMu.Unlock() + return d.getDirentsLocked(ctx) +} +// Preconditions: +// * filesystem.renameMu must be locked. +// * d.dirMu must be locked. +// * d.isDir(). +func (d *dentry) getDirentsLocked(ctx context.Context) ([]vfs.Dirent, error) { if d.dirents != nil { return d.dirents, nil } diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go index b530851c5..e9ce4bde1 100644 --- a/pkg/sentry/fsimpl/overlay/filesystem.go +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -15,6 +15,7 @@ package overlay import ( + "fmt" "strings" "sync/atomic" @@ -512,7 +513,7 @@ func (fs *filesystem) createWhiteout(ctx context.Context, vfsObj *vfs.VirtualFil func (fs *filesystem) cleanupRecreateWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) { if err := fs.createWhiteout(ctx, vfsObj, pop); err != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err)) } } @@ -624,7 +625,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. }, }); err != nil { if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); cleanupErr != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop) } @@ -663,7 +664,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v }, }); err != nil { if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } @@ -678,7 +679,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v Value: "y", }); err != nil { if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr)) } else { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } @@ -722,7 +723,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v }, }); err != nil { if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } @@ -942,7 +943,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving }, }); err != nil { if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } @@ -953,7 +954,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving child, err := fs.getChildLocked(ctx, parent, childName, ds) if err != nil { if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } @@ -1019,9 +1020,223 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } defer mnt.EndWrite() - // FIXME(gvisor.dev/issue/1199): Actually implement rename. - _ = newParent - return syserror.EXDEV + oldParent := oldParentVD.Dentry().Impl().(*dentry) + creds := rp.Credentials() + if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + // We need a dentry representing the renamed file since, if it's a + // directory, we need to check for write permission on it. + oldParent.dirMu.Lock() + defer oldParent.dirMu.Unlock() + renamed, err := fs.getChildLocked(ctx, oldParent, oldName, &ds) + if err != nil { + return err + } + if err := vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&oldParent.mode)), auth.KUID(atomic.LoadUint32(&renamed.uid))); err != nil { + return err + } + if renamed.isDir() { + if renamed == newParent || genericIsAncestorDentry(renamed, newParent) { + return syserror.EINVAL + } + if oldParent != newParent { + if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil { + return err + } + } + } else { + if opts.MustBeDir || rp.MustBeDir() { + return syserror.ENOTDIR + } + } + + if oldParent != newParent { + if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + newParent.dirMu.Lock() + defer newParent.dirMu.Unlock() + } + if newParent.vfsd.IsDead() { + return syserror.ENOENT + } + replacedLayer, err := fs.lookupLayerLocked(ctx, newParent, newName) + if err != nil { + return err + } + var ( + replaced *dentry + replacedVFSD *vfs.Dentry + whiteouts map[string]bool + ) + if replacedLayer.existsInOverlay() { + replaced, err = fs.getChildLocked(ctx, newParent, newName, &ds) + if err != nil { + return err + } + replacedVFSD = &replaced.vfsd + if replaced.isDir() { + if !renamed.isDir() { + return syserror.EISDIR + } + if genericIsAncestorDentry(replaced, renamed) { + return syserror.ENOTEMPTY + } + replaced.dirMu.Lock() + defer replaced.dirMu.Unlock() + whiteouts, err = replaced.collectWhiteoutsForRmdirLocked(ctx) + if err != nil { + return err + } + } else { + if rp.MustBeDir() || renamed.isDir() { + return syserror.ENOTDIR + } + } + } + + if oldParent == newParent && oldName == newName { + return nil + } + + // renamed and oldParent need to be copied-up before they're renamed on the + // upper layer. + if err := renamed.copyUpLocked(ctx); err != nil { + return err + } + // If renamed is a directory, all of its descendants need to be copied-up + // before they're renamed on the upper layer. + if renamed.isDir() { + if err := renamed.copyUpDescendantsLocked(ctx, &ds); err != nil { + return err + } + } + // newParent must be copied-up before it can contain renamed on the upper + // layer. + if err := newParent.copyUpLocked(ctx); err != nil { + return err + } + // If replaced exists, it doesn't need to be copied-up, but we do need to + // serialize with copy-up. Holding renameMu for writing should be + // sufficient, but out of an abundance of caution... + if replaced != nil { + replaced.copyMu.RLock() + defer replaced.copyMu.RUnlock() + } + + vfsObj := rp.VirtualFilesystem() + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef(ctx) + if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { + return err + } + + newpop := vfs.PathOperation{ + Root: newParent.upperVD, + Start: newParent.upperVD, + Path: fspath.Parse(newName), + } + + needRecreateWhiteouts := false + cleanupRecreateWhiteouts := func() { + if !needRecreateWhiteouts { + return + } + for whiteoutName, whiteoutUpper := range whiteouts { + if !whiteoutUpper { + continue + } + if err := fs.createWhiteout(ctx, vfsObj, &vfs.PathOperation{ + Root: replaced.upperVD, + Start: replaced.upperVD, + Path: fspath.Parse(whiteoutName), + }); err != nil && err != syserror.EEXIST { + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RenameAt failure: %v", err)) + } + } + } + if renamed.isDir() { + if replacedLayer == lookupLayerUpper { + // Remove whiteouts from the directory being replaced. + needRecreateWhiteouts = true + for whiteoutName, whiteoutUpper := range whiteouts { + if !whiteoutUpper { + continue + } + if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{ + Root: replaced.upperVD, + Start: replaced.upperVD, + Path: fspath.Parse(whiteoutName), + }); err != nil { + cleanupRecreateWhiteouts() + vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) + return err + } + } + } else if replacedLayer == lookupLayerUpperWhiteout { + // We need to explicitly remove the whiteout since otherwise rename + // on the upper layer will fail with ENOTDIR. + if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil { + vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) + return err + } + } + } + + // Essentially no gVisor filesystem supports RENAME_WHITEOUT, so just do a + // regular rename and create the whiteout at the origin manually. Unlike + // RENAME_WHITEOUT, this isn't atomic with respect to other users of the + // upper filesystem, but this is already the case for virtually all other + // overlay filesystem operations too. + oldpop := vfs.PathOperation{ + Root: oldParent.upperVD, + Start: oldParent.upperVD, + Path: fspath.Parse(oldName), + } + if err := vfsObj.RenameAt(ctx, creds, &oldpop, &newpop, &opts); err != nil { + cleanupRecreateWhiteouts() + vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) + return err + } + + // Below this point, the renamed dentry is now at newpop, and anything we + // replaced is gone forever. Commit the rename, update the overlay + // filesystem tree, and abandon attempts to recover from errors. + vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD) + delete(oldParent.children, oldName) + if replaced != nil { + ds = appendDentry(ds, replaced) + } + if oldParent != newParent { + newParent.dirents = nil + // This can't drop the last reference on oldParent because one is held + // by oldParentVD, so lock recursion is impossible. + oldParent.DecRef(ctx) + ds = appendDentry(ds, oldParent) + newParent.IncRef() + renamed.parent = newParent + } + renamed.name = newName + if newParent.children == nil { + newParent.children = make(map[string]*dentry) + } + newParent.children[newName] = renamed + oldParent.dirents = nil + + if err := fs.createWhiteout(ctx, vfsObj, &oldpop); err != nil { + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout at origin after RenameAt: %v", err)) + } + if renamed.isDir() { + if err := vfsObj.SetXattrAt(ctx, fs.creds, &newpop, &vfs.SetXattrOptions{ + Name: _OVL_XATTR_OPAQUE, + Value: "y", + }); err != nil { + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to make renamed directory opaque: %v", err)) + } + } + + return nil } // RmdirAt implements vfs.FilesystemImpl.RmdirAt. @@ -1100,7 +1315,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error Start: child.upperVD, Path: fspath.Parse(whiteoutName), }); err != nil && err != syserror.EEXIST { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err)) } } } @@ -1130,9 +1345,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error // Don't attempt to recover from this: the original directory is // already gone, so any dentries representing it are invalid, and // creating a new directory won't undo that. - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err) - vfsObj.AbortDeleteDentry(&child.vfsd) - return err + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err)) } vfsObj.CommitDeleteDentry(ctx, &child.vfsd) @@ -1246,7 +1459,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ }, }); err != nil { if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } @@ -1339,11 +1552,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } } if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err) - if child != nil { - vfsObj.AbortDeleteDentry(&child.vfsd) - } - return err + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err)) } if child != nil { diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go index b2efe5f80..d0d26185e 100644 --- a/pkg/sentry/fsimpl/overlay/overlay.go +++ b/pkg/sentry/fsimpl/overlay/overlay.go @@ -111,16 +111,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fsoptsRaw := opts.InternalData fsopts, haveFSOpts := fsoptsRaw.(FilesystemOptions) if fsoptsRaw != nil && !haveFSOpts { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw) + ctx.Infof("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw) return nil, nil, syserror.EINVAL } if haveFSOpts { if len(fsopts.LowerRoots) == 0 { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: LowerRoots must be non-empty") + ctx.Infof("overlay.FilesystemType.GetFilesystem: LowerRoots must be non-empty") return nil, nil, syserror.EINVAL } if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two LowerRoots are required when UpperRoot is unspecified") + ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two LowerRoots are required when UpperRoot is unspecified") return nil, nil, syserror.EINVAL } // We don't enforce a maximum number of lower layers when not @@ -137,7 +137,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt delete(mopts, "workdir") upperPath := fspath.Parse(upperPathname) if !upperPath.Absolute { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname) + ctx.Infof("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname) return nil, nil, syserror.EINVAL } upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ @@ -149,13 +149,13 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt CheckSearchable: true, }) if err != nil { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err) + ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err) return nil, nil, err } defer upperRoot.DecRef(ctx) privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */) if err != nil { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err) + ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err) return nil, nil, err } defer privateUpperRoot.DecRef(ctx) @@ -163,24 +163,24 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } lowerPathnamesStr, ok := mopts["lowerdir"] if !ok { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: missing required option lowerdir") + ctx.Infof("overlay.FilesystemType.GetFilesystem: missing required option lowerdir") return nil, nil, syserror.EINVAL } delete(mopts, "lowerdir") lowerPathnames := strings.Split(lowerPathnamesStr, ":") const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK if len(lowerPathnames) < 2 && !fsopts.UpperRoot.Ok() { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two lowerdirs are required when upperdir is unspecified") + ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two lowerdirs are required when upperdir is unspecified") return nil, nil, syserror.EINVAL } if len(lowerPathnames) > maxLowerLayers { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: %d lowerdirs specified, maximum %d", len(lowerPathnames), maxLowerLayers) + ctx.Infof("overlay.FilesystemType.GetFilesystem: %d lowerdirs specified, maximum %d", len(lowerPathnames), maxLowerLayers) return nil, nil, syserror.EINVAL } for _, lowerPathname := range lowerPathnames { lowerPath := fspath.Parse(lowerPathname) if !lowerPath.Absolute { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname) + ctx.Infof("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname) return nil, nil, syserror.EINVAL } lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ @@ -192,13 +192,13 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt CheckSearchable: true, }) if err != nil { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err) + ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err) return nil, nil, err } defer lowerRoot.DecRef(ctx) privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */) if err != nil { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err) + ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err) return nil, nil, err } defer privateLowerRoot.DecRef(ctx) @@ -206,7 +206,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } } if len(mopts) != 0 { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts) + ctx.Infof("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts) return nil, nil, syserror.EINVAL } @@ -279,7 +279,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return nil, nil, syserror.EREMOTE } if isWhiteout(&rootStat) { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout") + ctx.Infof("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout") root.destroyLocked(ctx) fs.vfsfs.DecRef(ctx) return nil, nil, syserror.EINVAL diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index d57d94dbc..57f026040 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -68,7 +68,7 @@ func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, return dentry } -// Lookup implements kernfs.inodeDynamicLookup. +// Lookup implements kernfs.inodeDynamicLookup.Lookup. func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { tid, err := strconv.ParseUint(name, 10, 32) if err != nil { @@ -87,7 +87,7 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, e return subTaskDentry.VFSDentry(), nil } -// IterDirents implements kernfs.inodeDynamicLookup. +// IterDirents implements kernfs.inodeDynamicLookup.IterDirents. func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { tasks := i.task.ThreadGroup().MemberIDs(i.pidns) if len(tasks) == 0 { @@ -155,7 +155,7 @@ func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) erro return fd.GenericDirectoryFD.SetStat(ctx, opts) } -// Open implements kernfs.Inode. +// Open implements kernfs.Inode.Open. func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &subtasksFD{task: i.task} if err := fd.Init(&i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ @@ -169,7 +169,7 @@ func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *v return fd.VFSFileDescription(), nil } -// Stat implements kernfs.Inode. +// Stat implements kernfs.Inode.Stat. func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts) if err != nil { @@ -181,12 +181,12 @@ func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs return stat, nil } -// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } -// DecRef implements kernfs.Inode. +// DecRef implements kernfs.Inode.DecRef. func (i *subtasksInode) DecRef(context.Context) { i.subtasksInodeRefs.DecRef(i.Destroy) } diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index dbdb5d929..e24c8a031 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -106,7 +106,7 @@ func (i *taskInode) Valid(ctx context.Context) bool { return i.task.ExitState() != kernel.TaskExitDead } -// Open implements kernfs.Inode. +// Open implements kernfs.Inode.Open. func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, @@ -117,12 +117,12 @@ func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D return fd.VFSFileDescription(), nil } -// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } -// DecRef implements kernfs.Inode. +// DecRef implements kernfs.Inode.DecRef. func (i *taskInode) DecRef(context.Context) { i.taskInodeRefs.DecRef(i.Destroy) } @@ -168,7 +168,7 @@ func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux. return d } -// Stat implements kernfs.Inode. +// Stat implements kernfs.Inode.Stat. func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { stat, err := i.Inode.Stat(ctx, fs, opts) if err != nil { @@ -186,7 +186,7 @@ func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs. return stat, nil } -// CheckPermissions implements kernfs.Inode. +// CheckPermissions implements kernfs.Inode.CheckPermissions. func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { mode := i.Mode() uid, gid := i.getOwner(mode) diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index 47d744a57..c492bcfa7 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -62,7 +62,7 @@ type fdDir struct { produceSymlink bool } -// IterDirents implements kernfs.inodeDynamicLookup. +// IterDirents implements kernfs.inodeDynamicLookup.IterDirents. func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { var fds []int32 i.task.WithMuLocked(func(t *kernel.Task) { @@ -135,7 +135,7 @@ func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry { return dentry } -// Lookup implements kernfs.inodeDynamicLookup. +// Lookup implements kernfs.inodeDynamicLookup.Lookup. func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { fdInt, err := strconv.ParseInt(name, 10, 32) if err != nil { @@ -149,7 +149,7 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro return taskDentry.VFSDentry(), nil } -// Open implements kernfs.Inode. +// Open implements kernfs.Inode.Open. func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, @@ -160,7 +160,7 @@ func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs. return fd.VFSFileDescription(), nil } -// CheckPermissions implements kernfs.Inode. +// CheckPermissions implements kernfs.Inode.CheckPermissions. // // This is to match Linux, which uses a special permission handler to guarantee // that a process can still access /proc/self/fd after it has executed @@ -182,7 +182,7 @@ func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentia return err } -// DecRef implements kernfs.Inode. +// DecRef implements kernfs.Inode.DecRef. func (i *fdDirInode) DecRef(context.Context) { i.fdDirInodeRefs.DecRef(i.Destroy) } @@ -269,7 +269,7 @@ func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry { return dentry } -// Lookup implements kernfs.inodeDynamicLookup. +// Lookup implements kernfs.inodeDynamicLookup.Lookup. func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { fdInt, err := strconv.ParseInt(name, 10, 32) if err != nil { @@ -287,7 +287,7 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, return dentry.VFSDentry(), nil } -// Open implements kernfs.Inode. +// Open implements kernfs.Inode.Open. func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, @@ -298,7 +298,7 @@ func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd * return fd.VFSFileDescription(), nil } -// DecRef implements kernfs.Inode. +// DecRef implements kernfs.Inode.DecRef. func (i *fdInfoDirInode) DecRef(context.Context) { i.fdInfoDirInodeRefs.DecRef(i.Destroy) } diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 87726d8f7..8f7e9b801 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -667,7 +667,7 @@ func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentr return d } -// Readlink implements kernfs.Inode. +// Readlink implements kernfs.Inode.Readlink. func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { if !kernel.ContextCanTrace(ctx, s.task, false) { return "", syserror.EACCES @@ -807,7 +807,7 @@ func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns stri return d } -// Readlink implements Inode. +// Readlink implements kernfs.Inode.Readlink. func (s *namespaceSymlink) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { if err := checkTaskState(s.task); err != nil { return "", err @@ -815,7 +815,7 @@ func (s *namespaceSymlink) Readlink(ctx context.Context, mnt *vfs.Mount) (string return s.StaticSymlink.Readlink(ctx, mnt) } -// Getlink implements Inode.Getlink. +// Getlink implements kernfs.Inode.Getlink. func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { if err := checkTaskState(s.task); err != nil { return vfs.VirtualDentry{}, "", err @@ -852,7 +852,7 @@ func (i *namespaceInode) Init(creds *auth.Credentials, devMajor, devMinor uint32 i.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm) } -// Open implements Inode.Open. +// Open implements kernfs.Inode.Open. func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &namespaceFD{inode: i} i.IncRef() @@ -875,20 +875,20 @@ type namespaceFD struct { var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil) -// Stat implements FileDescriptionImpl. +// Stat implements vfs.FileDescriptionImpl.Stat. func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() return fd.inode.Stat(ctx, vfs, opts) } -// SetStat implements FileDescriptionImpl. +// SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() creds := auth.CredentialsFromContext(ctx) return fd.inode.SetStat(ctx, vfs, creds, opts) } -// Release implements FileDescriptionImpl. +// Release implements vfs.FileDescriptionImpl.Release. func (fd *namespaceFD) Release(ctx context.Context) { fd.inode.DecRef(ctx) } diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go index 4e69782c7..1607eac19 100644 --- a/pkg/sentry/fsimpl/proc/task_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -660,7 +660,7 @@ func sprintSlice(s []uint64) string { return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice. } -// Generate implements vfs.DynamicBytesSource. +// Generate implements vfs.DynamicBytesSource.Generate. func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error { types := []interface{}{ &inet.StatSNMPIP{}, @@ -709,7 +709,7 @@ type netRouteData struct { var _ dynamicInode = (*netRouteData)(nil) -// Generate implements vfs.DynamicBytesSource. +// Generate implements vfs.DynamicBytesSource.Generate. // See Linux's net/ipv4/fib_trie.c:fib_route_seq_show. func (d *netRouteData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "%-127s\n", "Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT") @@ -773,7 +773,7 @@ type netStatData struct { var _ dynamicInode = (*netStatData)(nil) -// Generate implements vfs.DynamicBytesSource. +// Generate implements vfs.DynamicBytesSource.Generate. // See Linux's net/ipv4/fib_trie.c:fib_route_seq_show. func (d *netStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { buf.WriteString("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed " + diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 3ea00ab87..6d60acc30 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -98,7 +98,7 @@ func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace return inode, dentry } -// Lookup implements kernfs.inodeDynamicLookup. +// Lookup implements kernfs.inodeDynamicLookup.Lookup. func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { // Try to lookup a corresponding task. tid, err := strconv.ParseUint(name, 10, 64) @@ -122,7 +122,7 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro return taskDentry.VFSDentry(), nil } -// IterDirents implements kernfs.inodeDynamicLookup. +// IterDirents implements kernfs.inodeDynamicLookup.IterDirents. func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) { // fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256 const FIRST_PROCESS_ENTRY = 256 @@ -200,7 +200,7 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback return maxTaskID, nil } -// Open implements kernfs.Inode. +// Open implements kernfs.Inode.Open. func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, @@ -229,7 +229,7 @@ func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.St return stat, nil } -// DecRef implements kernfs.Inode. +// DecRef implements kernfs.Inode.DecRef. func (i *tasksInode) DecRef(context.Context) { i.tasksInodeRefs.DecRef(i.Destroy) } diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index 68c541046..459a8e52e 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -69,7 +69,7 @@ func (s *selfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualD return vfs.VirtualDentry{}, target, err } -// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } @@ -113,7 +113,7 @@ func (s *threadSelfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.Vi return vfs.VirtualDentry{}, target, err } -// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } @@ -125,7 +125,7 @@ type dynamicBytesFileSetAttr struct { kernfs.DynamicBytesFile } -// SetStat implements Inode.SetStat. +// SetStat implements kernfs.Inode.SetStat. func (d *dynamicBytesFileSetAttr) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { return d.DynamicBytesFile.InodeAttrs.SetStat(ctx, fs, creds, opts) } diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index 038a194c7..9e0966efe 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -174,7 +174,7 @@ type tcpSackData struct { var _ vfs.WritableDynamicBytesSource = (*tcpSackData)(nil) -// Generate implements vfs.DynamicBytesSource. +// Generate implements vfs.DynamicBytesSource.Generate. func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error { if d.enabled == nil { sack, err := d.stack.TCPSACKEnabled() @@ -232,7 +232,7 @@ type tcpRecoveryData struct { var _ vfs.WritableDynamicBytesSource = (*tcpRecoveryData)(nil) -// Generate implements vfs.DynamicBytesSource. +// Generate implements vfs.DynamicBytesSource.Generate. func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error { recovery, err := d.stack.TCPRecovery() if err != nil { @@ -284,7 +284,7 @@ type tcpMemData struct { var _ vfs.WritableDynamicBytesSource = (*tcpMemData)(nil) -// Generate implements vfs.DynamicBytesSource. +// Generate implements vfs.DynamicBytesSource.Generate. func (d *tcpMemData) Generate(ctx context.Context, buf *bytes.Buffer) error { d.mu.Lock() defer d.mu.Unlock() diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go index 6297e1df4..3c02af8c9 100644 --- a/pkg/sentry/fsimpl/signalfd/signalfd.go +++ b/pkg/sentry/fsimpl/signalfd/signalfd.go @@ -26,7 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// SignalFileDescription implements FileDescriptionImpl for signal fds. +// SignalFileDescription implements vfs.FileDescriptionImpl for signal fds. type SignalFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -83,7 +83,7 @@ func (sfd *SignalFileDescription) SetMask(mask linux.SignalSet) { sfd.mask = mask } -// Read implements FileDescriptionImpl.Read. +// Read implements vfs.FileDescriptionImpl.Read. func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { // Attempt to dequeue relevant signals. info, err := sfd.target.Sigtimedwait(sfd.Mask(), 0) @@ -132,5 +132,5 @@ func (sfd *SignalFileDescription) EventUnregister(entry *waiter.Entry) { sfd.target.SignalUnregister(entry) } -// Release implements FileDescriptionImpl.Release() +// Release implements vfs.FileDescriptionImpl.Release. func (sfd *SignalFileDescription) Release(context.Context) {} diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index 94a998568..80b41aa9e 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -30,12 +30,12 @@ import ( // filesystemType implements vfs.FilesystemType. type filesystemType struct{} -// GetFilesystem implements FilesystemType.GetFilesystem. +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { panic("sockfs.filesystemType.GetFilesystem should never be called") } -// Name implements FilesystemType.Name. +// Name implements vfs.FilesystemType.Name. // // Note that registering sockfs is unnecessary, except for the fact that it // will not show up under /proc/filesystems as a result. This is a very minor diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go index 86beaa0a8..ac8a4e3bb 100644 --- a/pkg/sentry/fsimpl/timerfd/timerfd.go +++ b/pkg/sentry/fsimpl/timerfd/timerfd.go @@ -26,7 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// TimerFileDescription implements FileDescriptionImpl for timer fds. It also +// TimerFileDescription implements vfs.FileDescriptionImpl for timer fds. It also // implements ktime.TimerListener. type TimerFileDescription struct { vfsfd vfs.FileDescription @@ -62,7 +62,7 @@ func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, return &tfd.vfsfd, nil } -// Read implements FileDescriptionImpl.Read. +// Read implements vfs.FileDescriptionImpl.Read. func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { const sizeofUint64 = 8 if dst.NumBytes() < sizeofUint64 { @@ -128,7 +128,7 @@ func (tfd *TimerFileDescription) ResumeTimer() { tfd.timer.Resume() } -// Release implements FileDescriptionImpl.Release() +// Release implements vfs.FileDescriptionImpl.Release. func (tfd *TimerFileDescription) Release(context.Context) { tfd.timer.Destroy() } diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 61d925a73..1362c1602 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -770,7 +770,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return nil } -// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { fs.mu.RLock() defer fs.mu.RUnlock() @@ -865,8 +865,16 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe } if d.parent == nil { if d.name != "" { - // This must be an anonymous memfd file. + // This file must have been created by + // newUnlinkedRegularFileDescription(). In Linux, + // mm/shmem.c:__shmem_file_setup() => + // fs/file_table.c:alloc_file_pseudo() sets the created + // dentry's dentry_operations to anon_ops, for which d_dname == + // simple_dname. fs/d_path.c:simple_dname() defines the + // dentry's pathname to be its name, prefixed with "/" and + // suffixed with " (deleted)". b.PrependComponent("/" + d.name) + b.AppendString(" (deleted)") return vfs.PrependPathSyntheticError{} } return vfs.PrependPathAtNonMountRootError{} diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 0710b65db..b8699d064 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -42,6 +42,10 @@ type regularFile struct { // memFile is a platform.File used to allocate pages to this regularFile. memFile *pgalloc.MemoryFile + // memoryUsageKind is the memory accounting category under which pages backing + // this regularFile's contents are accounted. + memoryUsageKind usage.MemoryKind + // mapsMu protects mappings. mapsMu sync.Mutex `state:"nosave"` @@ -86,14 +90,75 @@ type regularFile struct { func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode { file := ®ularFile{ - memFile: fs.memFile, - seals: linux.F_SEAL_SEAL, + memFile: fs.memFile, + memoryUsageKind: usage.Tmpfs, + seals: linux.F_SEAL_SEAL, } file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode) file.inode.nlink = 1 // from parent directory return &file.inode } +// newUnlinkedRegularFileDescription creates a regular file on the tmpfs +// filesystem represented by mount and returns an FD representing that file. +// The new file is not reachable by path traversal from any other file. +// +// newUnlinkedRegularFileDescription is analogous to Linux's +// mm/shmem.c:__shmem_file_setup(). +// +// Preconditions: mount must be a tmpfs mount. +func newUnlinkedRegularFileDescription(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, name string) (*regularFileFD, error) { + fs, ok := mount.Filesystem().Impl().(*filesystem) + if !ok { + panic("tmpfs.newUnlinkedRegularFileDescription() called with non-tmpfs mount") + } + + inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777) + d := fs.newDentry(inode) + defer d.DecRef(ctx) + d.name = name + + fd := ®ularFileFD{} + fd.Init(&inode.locks) + flags := uint32(linux.O_RDWR) + if err := fd.vfsfd.Init(fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return fd, nil +} + +// NewZeroFile creates a new regular file and file description as for +// mmap(MAP_SHARED | MAP_ANONYMOUS). The file has the given size and is +// initially (implicitly) filled with zeroes. +// +// Preconditions: mount must be a tmpfs mount. +func NewZeroFile(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, size uint64) (*vfs.FileDescription, error) { + // Compare mm/shmem.c:shmem_zero_setup(). + fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, "dev/zero") + if err != nil { + return nil, err + } + rf := fd.inode().impl.(*regularFile) + rf.memoryUsageKind = usage.Anonymous + rf.size = size + return &fd.vfsfd, err +} + +// NewMemfd creates a new regular file and file description as for +// memfd_create. +// +// Preconditions: mount must be a tmpfs mount. +func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) { + fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, name) + if err != nil { + return nil, err + } + if allowSeals { + fd.inode().impl.(*regularFile).seals = 0 + } + return &fd.vfsfd, nil +} + // truncate grows or shrinks the file to the given size. It returns true if the // file size was updated. func (rf *regularFile) truncate(newSize uint64) (bool, error) { @@ -226,7 +291,7 @@ func (rf *regularFile) Translate(ctx context.Context, required, optional memmap. optional.End = pgend } - cerr := rf.data.Fill(ctx, required, optional, rf.memFile, usage.Tmpfs, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { + cerr := rf.data.Fill(ctx, required, optional, rf.memFile, rf.memoryUsageKind, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { // Newly-allocated pages are zeroed, so we don't need to do anything. return dsts.NumBytes(), nil }) @@ -575,7 +640,7 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, case gap.Ok(): // Allocate memory for the write. gapMR := gap.Range().Intersect(pgMR) - fr, err := rw.file.memFile.Allocate(gapMR.Length(), usage.Tmpfs) + fr, err := rw.file.memFile.Allocate(gapMR.Length(), rw.file.memoryUsageKind) if err != nil { retErr = err goto exitLoop diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 4871e55d3..4658e1533 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -746,37 +746,6 @@ func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { return nil } -// NewMemfd creates a new tmpfs regular file and file description that can back -// an anonymous fd created by memfd_create. -func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) { - fs, ok := mount.Filesystem().Impl().(*filesystem) - if !ok { - panic("NewMemfd() called with non-tmpfs mount") - } - - // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with - // S_IRWXUGO. - inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777) - rf := inode.impl.(*regularFile) - if allowSeals { - rf.seals = 0 - } - - d := fs.newDentry(inode) - defer d.DecRef(ctx) - d.name = name - - // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with - // FMODE_READ | FMODE_WRITE. - var fd regularFileFD - fd.Init(&inode.locks) - flags := uint32(linux.O_RDWR) - if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { - return nil, err - } - return &fd.vfsfd, nil -} - // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go index f86a6a0b2..26b117ca4 100644 --- a/pkg/sentry/fsimpl/verity/filesystem.go +++ b/pkg/sentry/fsimpl/verity/filesystem.go @@ -185,8 +185,7 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de Start: child.lowerMerkleVD, }, &vfs.GetXattrOptions{ Name: merkleOffsetInParentXattr, - // Offset is a 32 bit integer. - Size: sizeOfInt32, + Size: sizeOfStringInt32, }) // The Merkle tree file for the child should have been created and @@ -227,7 +226,7 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de // the size of all its children's root hashes. dataSize, err := parentMerkleFD.GetXattr(ctx, &vfs.GetXattrOptions{ Name: merkleSizeXattr, - Size: sizeOfInt32, + Size: sizeOfStringInt32, }) // The Merkle tree file for the child should have been created and @@ -372,6 +371,7 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, Path: fspath.Parse(childMerkleFilename), }, &vfs.OpenOptions{ Flags: linux.O_RDWR | linux.O_CREAT, + Mode: 0644, }) if err != nil { return nil, err @@ -807,7 +807,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return syserror.EROFS } -// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { var ds *[]*dentry fs.renameMu.RLock() diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go index 3e0bcd02b..9182df317 100644 --- a/pkg/sentry/fsimpl/verity/verity.go +++ b/pkg/sentry/fsimpl/verity/verity.go @@ -57,8 +57,9 @@ const merkleOffsetInParentXattr = "user.merkle.offset" // whole file. For a directory, it's the size of all its children's root hashes. const merkleSizeXattr = "user.merkle.size" -// sizeOfInt32 is the size in bytes for a 32 bit integer in extended attributes. -const sizeOfInt32 = 4 +// sizeOfStringInt32 is the size for a 32 bit integer stored as string in +// extended attributes. The maximum value of a 32 bit integer is 10 digits. +const sizeOfStringInt32 = 10 // noCrashOnVerificationFailure indicates whether the sandbox should panic // whenever verification fails. If true, an error is returned instead of @@ -636,7 +637,7 @@ func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of // dataSize is the size of the whole file. dataSize, err := fd.merkleReader.GetXattr(ctx, &vfs.GetXattrOptions{ Name: merkleSizeXattr, - Size: sizeOfInt32, + Size: sizeOfStringInt32, }) // The Merkle tree file for the child should have been created and diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 3e85964e4..8c9f11cce 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -242,7 +242,7 @@ type MemoryManager struct { // +stateify savable type vma struct { // mappable is the virtual memory object mapped by this vma. If mappable is - // nil, the vma represents a private anonymous mapping. + // nil, the vma represents an anonymous mapping. mappable memmap.Mappable // off is the offset into mappable at which this vma begins. If mappable is diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go index fdc308542..acac3d357 100644 --- a/pkg/sentry/mm/mm_test.go +++ b/pkg/sentry/mm/mm_test.go @@ -51,7 +51,8 @@ func TestUsageASUpdates(t *testing.T) { defer mm.DecUsers(ctx) addr, err := mm.MMap(ctx, memmap.MMapOpts{ - Length: 2 * usermem.PageSize, + Length: 2 * usermem.PageSize, + Private: true, }) if err != nil { t.Fatalf("MMap got err %v want nil", err) diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index f4c93baeb..2dbe5b751 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -136,9 +136,12 @@ func (m *SpecialMappable) Length() uint64 { // NewSharedAnonMappable returns a SpecialMappable that implements the // semantics of mmap(MAP_SHARED|MAP_ANONYMOUS) and mappings of /dev/zero. // -// TODO(jamieliu): The use of SpecialMappable is a lazy code reuse hack. Linux -// uses an ephemeral file created by mm/shmem.c:shmem_zero_setup(); we should -// do the same to get non-zero device and inode IDs. +// TODO(gvisor.dev/issue/1624): Linux uses an ephemeral file created by +// mm/shmem.c:shmem_zero_setup(), and VFS2 does something analogous. VFS1 uses +// a SpecialMappable instead, incorrectly getting device and inode IDs of zero +// and causing memory for shared anonymous mappings to be allocated up-front +// instead of on first touch; this is to avoid exacerbating the fs.MountSource +// leak (b/143656263). Delete this function along with VFS1. func NewSharedAnonMappable(length uint64, mfp pgalloc.MemoryFileProvider) (*SpecialMappable, error) { if length == 0 { return nil, syserror.EINVAL diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 4c9a575e7..a2555ba1a 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -24,7 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -93,18 +92,6 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme } } else { opts.Offset = 0 - if !opts.Private { - if opts.MappingIdentity != nil { - return 0, syserror.EINVAL - } - m, err := NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx)) - if err != nil { - return 0, err - } - defer m.DecRef(ctx) - opts.MappingIdentity = m - opts.Mappable = m - } } if opts.Addr.RoundDown() != opts.Addr { diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index 3e1735079..871ea80ee 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -146,6 +146,10 @@ func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error { case stack.FilterTable: table = stack.EmptyFilterTable() case stack.NATTable: + if ipv6 { + nflog("IPv6 redirection not yet supported (gvisor.dev/issue/3549)") + return syserr.ErrInvalidArgument + } table = stack.EmptyNATTable() default: nflog("we don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String()) diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go index 0bfd6c1f4..844acfede 100644 --- a/pkg/sentry/socket/netfilter/tcp_matcher.go +++ b/pkg/sentry/socket/netfilter/tcp_matcher.go @@ -97,17 +97,33 @@ func (*TCPMatcher) Name() string { // Match implements Matcher.Match. func (tm *TCPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { - netHeader := header.IPv4(pkt.NetworkHeader().View()) + // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved + // into the stack.Check codepath as matchers are added. + switch pkt.NetworkProtocolNumber { + case header.IPv4ProtocolNumber: + netHeader := header.IPv4(pkt.NetworkHeader().View()) + if netHeader.TransportProtocol() != header.TCPProtocolNumber { + return false, false + } - if netHeader.TransportProtocol() != header.TCPProtocolNumber { - return false, false - } + // We don't match fragments. + if frag := netHeader.FragmentOffset(); frag != 0 { + if frag == 1 { + return false, true + } + return false, false + } - // We dont't match fragments. - if frag := netHeader.FragmentOffset(); frag != 0 { - if frag == 1 { - return false, true + case header.IPv6ProtocolNumber: + // As in Linux, we do not perform an IPv6 fragment check. See + // xt_action_param.fragoff in + // include/linux/netfilter/x_tables.h. + if header.IPv6(pkt.NetworkHeader().View()).TransportProtocol() != header.TCPProtocolNumber { + return false, false } + + default: + // We don't know the network protocol. return false, false } diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go index 7ed05461d..63201201c 100644 --- a/pkg/sentry/socket/netfilter/udp_matcher.go +++ b/pkg/sentry/socket/netfilter/udp_matcher.go @@ -94,19 +94,33 @@ func (*UDPMatcher) Name() string { // Match implements Matcher.Match. func (um *UDPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { - netHeader := header.IPv4(pkt.NetworkHeader().View()) - // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved // into the stack.Check codepath as matchers are added. - if netHeader.TransportProtocol() != header.UDPProtocolNumber { - return false, false - } + switch pkt.NetworkProtocolNumber { + case header.IPv4ProtocolNumber: + netHeader := header.IPv4(pkt.NetworkHeader().View()) + if netHeader.TransportProtocol() != header.UDPProtocolNumber { + return false, false + } - // We dont't match fragments. - if frag := netHeader.FragmentOffset(); frag != 0 { - if frag == 1 { - return false, true + // We don't match fragments. + if frag := netHeader.FragmentOffset(); frag != 0 { + if frag == 1 { + return false, true + } + return false, false } + + case header.IPv6ProtocolNumber: + // As in Linux, we do not perform an IPv6 fragment check. See + // xt_action_param.fragoff in + // include/linux/netfilter/x_tables.h. + if header.IPv6(pkt.NetworkHeader().View()).TransportProtocol() != header.UDPProtocolNumber { + return false, false + } + + default: + // We don't know the network protocol. return false, false } diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 33edb1b6f..6fede181a 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -158,6 +158,9 @@ var Metrics = tcpip.Stats{ OutgoingPacketErrors: mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Total number of IP packets which failed to write to a link-layer endpoint."), MalformedPacketsReceived: mustCreateMetric("/netstack/ip/malformed_packets_received", "Total number of IP packets which failed IP header validation checks."), MalformedFragmentsReceived: mustCreateMetric("/netstack/ip/malformed_fragments_received", "Total number of IP fragments which failed IP fragment validation checks."), + IPTablesPreroutingDropped: mustCreateMetric("/netstack/ip/iptables/prerouting_dropped", "Total number of IP packets dropped in the Prerouting chain."), + IPTablesInputDropped: mustCreateMetric("/netstack/ip/iptables/input_dropped", "Total number of IP packets dropped in the Input chain."), + IPTablesOutputDropped: mustCreateMetric("/netstack/ip/iptables/output_dropped", "Total number of IP packets dropped in the Output chain."), }, TCP: tcpip.TCPStats{ ActiveConnectionOpenings: mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."), @@ -1185,7 +1188,7 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam var v tcpip.LingerOption var linger linux.Linger if err := ep.GetSockOpt(&v); err != nil { - return &linger, nil + return nil, syserr.TranslateNetstackError(err) } if v.Enabled { diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 08504560c..d6fc03520 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -746,6 +746,9 @@ type baseEndpoint struct { // path is not empty if the endpoint has been bound, // or may be used if the endpoint is connected. path string + + // linger is used for SO_LINGER socket option. + linger tcpip.LingerOption } // EventRegister implements waiter.Waitable.EventRegister. @@ -841,8 +844,14 @@ func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMess return n, err } -// SetSockOpt sets a socket option. Currently not supported. -func (e *baseEndpoint) SetSockOpt(tcpip.SettableSocketOption) *tcpip.Error { +// SetSockOpt sets a socket option. +func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { + switch v := opt.(type) { + case *tcpip.LingerOption: + e.Lock() + e.linger = *v + e.Unlock() + } return nil } @@ -945,8 +954,11 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { // GetSockOpt implements tcpip.Endpoint.GetSockOpt. func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { - switch opt.(type) { + switch o := opt.(type) { case *tcpip.LingerOption: + e.Lock() + *o = e.linger + e.Unlock() return nil default: diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 8ab062bca..cd8dfdfa4 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -100,6 +100,15 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if err := file.ConfigureMMap(t, &opts); err != nil { return 0, nil, err } + } else if shared { + // Back shared anonymous mappings with a special mappable. + opts.Offset = 0 + m, err := mm.NewSharedAnonMappable(opts.Length, t.Kernel()) + if err != nil { + return 0, nil, err + } + opts.MappingIdentity = m // transfers ownership of m to opts + opts.Mappable = m } rv, err := t.MemoryManager().MMap(t, opts) diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go index cd6f4dd94..bfcf44b6f 100644 --- a/pkg/sentry/syscalls/linux/sys_sched.go +++ b/pkg/sentry/syscalls/linux/sys_sched.go @@ -30,7 +30,7 @@ const ( // // +marshal type SchedParam struct { - schedPriority int64 + schedPriority int32 } // SchedGetparam implements linux syscall sched_getparam(2). diff --git a/pkg/sentry/syscalls/linux/vfs2/mmap.go b/pkg/sentry/syscalls/linux/vfs2/mmap.go index dc05c2994..9d9dbf775 100644 --- a/pkg/sentry/syscalls/linux/vfs2/mmap.go +++ b/pkg/sentry/syscalls/linux/vfs2/mmap.go @@ -17,6 +17,7 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/syserror" @@ -85,6 +86,17 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if err := file.ConfigureMMap(t, &opts); err != nil { return 0, nil, err } + } else if shared { + // Back shared anonymous mappings with an anonymous tmpfs file. + opts.Offset = 0 + file, err := tmpfs.NewZeroFile(t, t.Credentials(), t.Kernel().ShmMount(), opts.Length) + if err != nil { + return 0, nil, err + } + defer file.DecRef(t) + if err := file.ConfigureMMap(t, &opts); err != nil { + return 0, nil, err + } } rv, err := t.MemoryManager().MMap(t, opts) diff --git a/pkg/tcpip/header/parse/BUILD b/pkg/tcpip/header/parse/BUILD new file mode 100644 index 000000000..2adee9288 --- /dev/null +++ b/pkg/tcpip/header/parse/BUILD @@ -0,0 +1,15 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "parse", + srcs = ["parse.go"], + visibility = ["//visibility:public"], + deps = [ + "//pkg/tcpip", + "//pkg/tcpip/buffer", + "//pkg/tcpip/header", + "//pkg/tcpip/stack", + ], +) diff --git a/pkg/tcpip/header/parse/parse.go b/pkg/tcpip/header/parse/parse.go new file mode 100644 index 000000000..522135557 --- /dev/null +++ b/pkg/tcpip/header/parse/parse.go @@ -0,0 +1,166 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package parse provides utilities to parse packets. +package parse + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +// ARP populates pkt's network header with an ARP header found in +// pkt.Data. +// +// Returns true if the header was successfully parsed. +func ARP(pkt *stack.PacketBuffer) bool { + _, ok := pkt.NetworkHeader().Consume(header.ARPSize) + if ok { + pkt.NetworkProtocolNumber = header.ARPProtocolNumber + } + return ok +} + +// IPv4 parses an IPv4 packet found in pkt.Data and populates pkt's network +// header with the IPv4 header. +// +// Returns true if the header was successfully parsed. +func IPv4(pkt *stack.PacketBuffer) bool { + hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize) + if !ok { + return false + } + ipHdr := header.IPv4(hdr) + + // Header may have options, determine the true header length. + headerLen := int(ipHdr.HeaderLength()) + if headerLen < header.IPv4MinimumSize { + // TODO(gvisor.dev/issue/2404): Per RFC 791, IHL needs to be at least 5 in + // order for the packet to be valid. Figure out if we want to reject this + // case. + headerLen = header.IPv4MinimumSize + } + hdr, ok = pkt.NetworkHeader().Consume(headerLen) + if !ok { + return false + } + ipHdr = header.IPv4(hdr) + + pkt.NetworkProtocolNumber = header.IPv4ProtocolNumber + pkt.Data.CapLength(int(ipHdr.TotalLength()) - len(hdr)) + return true +} + +// IPv6 parses an IPv6 packet found in pkt.Data and populates pkt's network +// header with the IPv6 header. +func IPv6(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, fragID uint32, fragOffset uint16, fragMore bool, ok bool) { + hdr, ok := pkt.Data.PullUp(header.IPv6MinimumSize) + if !ok { + return 0, 0, 0, false, false + } + ipHdr := header.IPv6(hdr) + + // dataClone consists of: + // - Any IPv6 header bytes after the first 40 (i.e. extensions). + // - The transport header, if present. + // - Any other payload data. + views := [8]buffer.View{} + dataClone := pkt.Data.Clone(views[:]) + dataClone.TrimFront(header.IPv6MinimumSize) + it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(ipHdr.NextHeader()), dataClone) + + // Iterate over the IPv6 extensions to find their length. + var nextHdr tcpip.TransportProtocolNumber + var extensionsSize int + +traverseExtensions: + for { + extHdr, done, err := it.Next() + if err != nil { + break + } + + // If we exhaust the extension list, the entire packet is the IPv6 header + // and (possibly) extensions. + if done { + extensionsSize = dataClone.Size() + break + } + + switch extHdr := extHdr.(type) { + case header.IPv6FragmentExtHdr: + if fragID == 0 && fragOffset == 0 && !fragMore { + fragID = extHdr.ID() + fragOffset = extHdr.FragmentOffset() + fragMore = extHdr.More() + } + + case header.IPv6RawPayloadHeader: + // We've found the payload after any extensions. + extensionsSize = dataClone.Size() - extHdr.Buf.Size() + nextHdr = tcpip.TransportProtocolNumber(extHdr.Identifier) + break traverseExtensions + + default: + // Any other extension is a no-op, keep looping until we find the payload. + } + } + + // Put the IPv6 header with extensions in pkt.NetworkHeader(). + hdr, ok = pkt.NetworkHeader().Consume(header.IPv6MinimumSize + extensionsSize) + if !ok { + panic(fmt.Sprintf("pkt.Data should have at least %d bytes, but only has %d.", header.IPv6MinimumSize+extensionsSize, pkt.Data.Size())) + } + ipHdr = header.IPv6(hdr) + pkt.Data.CapLength(int(ipHdr.PayloadLength())) + pkt.NetworkProtocolNumber = header.IPv6ProtocolNumber + + return nextHdr, fragID, fragOffset, fragMore, true +} + +// UDP parses a UDP packet found in pkt.Data and populates pkt's transport +// header with the UDP header. +// +// Returns true if the header was successfully parsed. +func UDP(pkt *stack.PacketBuffer) bool { + _, ok := pkt.TransportHeader().Consume(header.UDPMinimumSize) + return ok +} + +// TCP parses a TCP packet found in pkt.Data and populates pkt's transport +// header with the TCP header. +// +// Returns true if the header was successfully parsed. +func TCP(pkt *stack.PacketBuffer) bool { + // TCP header is variable length, peek at it first. + hdrLen := header.TCPMinimumSize + hdr, ok := pkt.Data.PullUp(hdrLen) + if !ok { + return false + } + + // If the header has options, pull those up as well. + if offset := int(header.TCP(hdr).DataOffset()); offset > header.TCPMinimumSize && offset <= pkt.Data.Size() { + // TODO(gvisor.dev/issue/2404): Figure out whether to reject this kind of + // packets. + hdrLen = offset + } + + _, ok = pkt.TransportHeader().Consume(hdrLen) + return ok +} diff --git a/pkg/tcpip/link/sniffer/BUILD b/pkg/tcpip/link/sniffer/BUILD index 7cbc305e7..4aac12a8c 100644 --- a/pkg/tcpip/link/sniffer/BUILD +++ b/pkg/tcpip/link/sniffer/BUILD @@ -14,6 +14,7 @@ go_library( "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", + "//pkg/tcpip/header/parse", "//pkg/tcpip/link/nested", "//pkg/tcpip/stack", ], diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go index 4fb127978..560477926 100644 --- a/pkg/tcpip/link/sniffer/sniffer.go +++ b/pkg/tcpip/link/sniffer/sniffer.go @@ -31,6 +31,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/link/nested" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -195,49 +196,52 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P var transProto uint8 src := tcpip.Address("unknown") dst := tcpip.Address("unknown") - id := 0 - size := uint16(0) + var size uint16 + var id uint32 var fragmentOffset uint16 var moreFragments bool - // Examine the packet using a new VV. Backing storage must not be written. - vv := buffer.NewVectorisedView(pkt.Size(), pkt.Views()) - + // Clone the packet buffer to not modify the original. + // + // We don't clone the original packet buffer so that the new packet buffer + // does not have any of its headers set. + pkt = stack.NewPacketBuffer(stack.PacketBufferOptions{Data: buffer.NewVectorisedView(pkt.Size(), pkt.Views())}) switch protocol { case header.IPv4ProtocolNumber: - hdr, ok := vv.PullUp(header.IPv4MinimumSize) - if !ok { + if ok := parse.IPv4(pkt); !ok { return } - ipv4 := header.IPv4(hdr) + + ipv4 := header.IPv4(pkt.NetworkHeader().View()) fragmentOffset = ipv4.FragmentOffset() moreFragments = ipv4.Flags()&header.IPv4FlagMoreFragments == header.IPv4FlagMoreFragments src = ipv4.SourceAddress() dst = ipv4.DestinationAddress() transProto = ipv4.Protocol() size = ipv4.TotalLength() - uint16(ipv4.HeaderLength()) - vv.TrimFront(int(ipv4.HeaderLength())) - id = int(ipv4.ID()) + id = uint32(ipv4.ID()) case header.IPv6ProtocolNumber: - hdr, ok := vv.PullUp(header.IPv6MinimumSize) + proto, fragID, fragOffset, fragMore, ok := parse.IPv6(pkt) if !ok { return } - ipv6 := header.IPv6(hdr) + + ipv6 := header.IPv6(pkt.NetworkHeader().View()) src = ipv6.SourceAddress() dst = ipv6.DestinationAddress() - transProto = ipv6.NextHeader() + transProto = uint8(proto) size = ipv6.PayloadLength() - vv.TrimFront(header.IPv6MinimumSize) + id = fragID + moreFragments = fragMore + fragmentOffset = fragOffset case header.ARPProtocolNumber: - hdr, ok := vv.PullUp(header.ARPSize) - if !ok { + if parse.ARP(pkt) { return } - vv.TrimFront(header.ARPSize) - arp := header.ARP(hdr) + + arp := header.ARP(pkt.NetworkHeader().View()) log.Infof( "%s arp %s (%s) -> %s (%s) valid:%t", prefix, @@ -259,7 +263,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P switch tcpip.TransportProtocolNumber(transProto) { case header.ICMPv4ProtocolNumber: transName = "icmp" - hdr, ok := vv.PullUp(header.ICMPv4MinimumSize) + hdr, ok := pkt.Data.PullUp(header.ICMPv4MinimumSize) if !ok { break } @@ -296,7 +300,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P case header.ICMPv6ProtocolNumber: transName = "icmp" - hdr, ok := vv.PullUp(header.ICMPv6MinimumSize) + hdr, ok := pkt.Data.PullUp(header.ICMPv6MinimumSize) if !ok { break } @@ -331,11 +335,11 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P case header.UDPProtocolNumber: transName = "udp" - hdr, ok := vv.PullUp(header.UDPMinimumSize) - if !ok { + if ok := parse.UDP(pkt); !ok { break } - udp := header.UDP(hdr) + + udp := header.UDP(pkt.TransportHeader().View()) if fragmentOffset == 0 { srcPort = udp.SourcePort() dstPort = udp.DestinationPort() @@ -345,19 +349,19 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P case header.TCPProtocolNumber: transName = "tcp" - hdr, ok := vv.PullUp(header.TCPMinimumSize) - if !ok { + if ok := parse.TCP(pkt); !ok { break } - tcp := header.TCP(hdr) + + tcp := header.TCP(pkt.TransportHeader().View()) if fragmentOffset == 0 { offset := int(tcp.DataOffset()) if offset < header.TCPMinimumSize { details += fmt.Sprintf("invalid packet: tcp data offset too small %d", offset) break } - if offset > vv.Size() && !moreFragments { - details += fmt.Sprintf("invalid packet: tcp data offset %d larger than packet buffer length %d", offset, vv.Size()) + if size := pkt.Data.Size() + len(tcp); offset > size && !moreFragments { + details += fmt.Sprintf("invalid packet: tcp data offset %d larger than tcp packet length %d", offset, size) break } diff --git a/pkg/tcpip/network/arp/BUILD b/pkg/tcpip/network/arp/BUILD index 82c073e32..b40dde96b 100644 --- a/pkg/tcpip/network/arp/BUILD +++ b/pkg/tcpip/network/arp/BUILD @@ -10,6 +10,7 @@ go_library( "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", + "//pkg/tcpip/header/parse", "//pkg/tcpip/stack", ], ) diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go index 7aaee08c4..cb9225bd7 100644 --- a/pkg/tcpip/network/arp/arp.go +++ b/pkg/tcpip/network/arp/arp.go @@ -29,6 +29,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -234,11 +235,7 @@ func (*protocol) Wait() {} // Parse implements stack.NetworkProtocol.Parse. func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) { - _, ok = pkt.NetworkHeader().Consume(header.ARPSize) - if !ok { - return 0, false, false - } - return 0, false, true + return 0, false, parse.ARP(pkt) } // NewProtocol returns an ARP network protocol. diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD index c82593e71..f9c2aa980 100644 --- a/pkg/tcpip/network/ipv4/BUILD +++ b/pkg/tcpip/network/ipv4/BUILD @@ -13,6 +13,7 @@ go_library( "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", + "//pkg/tcpip/header/parse", "//pkg/tcpip/network/fragmentation", "//pkg/tcpip/network/hash", "//pkg/tcpip/stack", diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index f4394749d..b14b356d6 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -26,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/network/fragmentation" "gvisor.dev/gvisor/pkg/tcpip/network/hash" "gvisor.dev/gvisor/pkg/tcpip/stack" @@ -235,14 +236,17 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw ipt := e.stack.IPTables() if ok := ipt.Check(stack.Output, pkt, gso, r, "", nicName); !ok { // iptables is telling us to drop the packet. + r.Stats().IP.IPTablesOutputDropped.Increment() return nil } - // If the packet is manipulated as per NAT Ouput rules, handle packet - // based on destination address and do not send the packet to link layer. - // TODO(gvisor.dev/issue/170): We should do this for every packet, rather than - // only NATted packets, but removing this check short circuits broadcasts - // before they are sent out to other hosts. + // If the packet is manipulated as per NAT Output rules, handle packet + // based on destination address and do not send the packet to link + // layer. + // + // TODO(gvisor.dev/issue/170): We should do this for every + // packet, rather than only NATted packets, but removing this check + // short circuits broadcasts before they are sent out to other hosts. if pkt.NatDone { netHeader := header.IPv4(pkt.NetworkHeader().View()) ep, err := e.stack.FindNetworkEndpoint(header.IPv4ProtocolNumber, netHeader.DestinationAddress()) @@ -297,8 +301,9 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe r.Stats().IP.PacketsSent.IncrementBy(uint64(n)) return n, err } + r.Stats().IP.IPTablesOutputDropped.IncrementBy(uint64(len(dropped))) - // Slow Path as we are dropping some packets in the batch degrade to + // Slow path as we are dropping some packets in the batch degrade to // emitting one packet at a time. n := 0 for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { @@ -318,12 +323,15 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe } if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, pkt); err != nil { r.Stats().IP.PacketsSent.IncrementBy(uint64(n)) - return n, err + // Dropped packets aren't errors, so include them in + // the return value. + return n + len(dropped), err } n++ } r.Stats().IP.PacketsSent.IncrementBy(uint64(n)) - return n, nil + // Dropped packets aren't errors, so include them in the return value. + return n + len(dropped), nil } // WriteHeaderIncludedPacket writes a packet already containing a network @@ -392,6 +400,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) { ipt := e.stack.IPTables() if ok := ipt.Check(stack.Input, pkt, nil, nil, "", ""); !ok { // iptables is telling us to drop the packet. + r.Stats().IP.IPTablesInputDropped.Increment() return } @@ -527,37 +536,14 @@ func (*protocol) Close() {} // Wait implements stack.TransportProtocol.Wait. func (*protocol) Wait() {} -// Parse implements stack.TransportProtocol.Parse. +// Parse implements stack.NetworkProtocol.Parse. func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) { - hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize) - if !ok { - return 0, false, false - } - ipHdr := header.IPv4(hdr) - - // Header may have options, determine the true header length. - headerLen := int(ipHdr.HeaderLength()) - if headerLen < header.IPv4MinimumSize { - // TODO(gvisor.dev/issue/2404): Per RFC 791, IHL needs to be at least 5 in - // order for the packet to be valid. Figure out if we want to reject this - // case. - headerLen = header.IPv4MinimumSize - } - hdr, ok = pkt.NetworkHeader().Consume(headerLen) - if !ok { + if ok := parse.IPv4(pkt); !ok { return 0, false, false } - ipHdr = header.IPv4(hdr) - - // If this is a fragment, don't bother parsing the transport header. - parseTransportHeader := true - if ipHdr.More() || ipHdr.FragmentOffset() != 0 { - parseTransportHeader = false - } - pkt.NetworkProtocolNumber = header.IPv4ProtocolNumber - pkt.Data.CapLength(int(ipHdr.TotalLength()) - len(hdr)) - return ipHdr.TransportProtocol(), parseTransportHeader, true + ipHdr := header.IPv4(pkt.NetworkHeader().View()) + return ipHdr.TransportProtocol(), !ipHdr.More() && ipHdr.FragmentOffset() == 0, true } // calculateMTU calculates the network-layer payload MTU based on the link-layer diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go index 5e50558e8..b14bc98e8 100644 --- a/pkg/tcpip/network/ipv4/ipv4_test.go +++ b/pkg/tcpip/network/ipv4/ipv4_test.go @@ -1046,3 +1046,252 @@ func TestReceiveFragments(t *testing.T) { }) } } + +func TestWriteStats(t *testing.T) { + const nPackets = 3 + tests := []struct { + name string + setup func(*testing.T, *stack.Stack) + linkEP func() stack.LinkEndpoint + expectSent int + expectDropped int + expectWritten int + }{ + { + name: "Accept all", + // No setup needed, tables accept everything by default. + setup: func(*testing.T, *stack.Stack) {}, + linkEP: func() stack.LinkEndpoint { return &limitedEP{nPackets} }, + expectSent: nPackets, + expectDropped: 0, + expectWritten: nPackets, + }, { + name: "Accept all with error", + // No setup needed, tables accept everything by default. + setup: func(*testing.T, *stack.Stack) {}, + linkEP: func() stack.LinkEndpoint { return &limitedEP{nPackets - 1} }, + expectSent: nPackets - 1, + expectDropped: 0, + expectWritten: nPackets - 1, + }, { + name: "Drop all", + setup: func(t *testing.T, stk *stack.Stack) { + // Install Output DROP rule. + t.Helper() + ipt := stk.IPTables() + filter, ok := ipt.GetTable(stack.FilterTable, false /* ipv6 */) + if !ok { + t.Fatalf("failed to find filter table") + } + ruleIdx := filter.BuiltinChains[stack.Output] + filter.Rules[ruleIdx].Target = stack.DropTarget{} + if err := ipt.ReplaceTable(stack.FilterTable, filter, false /* ipv6 */); err != nil { + t.Fatalf("failed to replace table: %v", err) + } + }, + linkEP: func() stack.LinkEndpoint { return &limitedEP{nPackets} }, + expectSent: 0, + expectDropped: nPackets, + expectWritten: nPackets, + }, { + name: "Drop some", + setup: func(t *testing.T, stk *stack.Stack) { + // Install Output DROP rule that matches only 1 + // of the 3 packets. + t.Helper() + ipt := stk.IPTables() + filter, ok := ipt.GetTable(stack.FilterTable, false /* ipv6 */) + if !ok { + t.Fatalf("failed to find filter table") + } + // We'll match and DROP the last packet. + ruleIdx := filter.BuiltinChains[stack.Output] + filter.Rules[ruleIdx].Target = stack.DropTarget{} + filter.Rules[ruleIdx].Matchers = []stack.Matcher{&limitedMatcher{nPackets - 1}} + // Make sure the next rule is ACCEPT. + filter.Rules[ruleIdx+1].Target = stack.AcceptTarget{} + if err := ipt.ReplaceTable(stack.FilterTable, filter, false /* ipv6 */); err != nil { + t.Fatalf("failed to replace table: %v", err) + } + }, + linkEP: func() stack.LinkEndpoint { return &limitedEP{nPackets} }, + expectSent: nPackets - 1, + expectDropped: 1, + expectWritten: nPackets, + }, + } + + // Parameterize the tests to run with both WritePacket and WritePackets. + writers := []struct { + name string + writePackets func(*stack.Route, stack.PacketBufferList) (int, *tcpip.Error) + }{ + { + name: "WritePacket", + writePackets: func(rt *stack.Route, pkts stack.PacketBufferList) (int, *tcpip.Error) { + nWritten := 0 + for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { + if err := rt.WritePacket(nil, stack.NetworkHeaderParams{}, pkt); err != nil { + return nWritten, err + } + nWritten++ + } + return nWritten, nil + }, + }, { + name: "WritePackets", + writePackets: func(rt *stack.Route, pkts stack.PacketBufferList) (int, *tcpip.Error) { + return rt.WritePackets(nil, pkts, stack.NetworkHeaderParams{}) + }, + }, + } + + for _, writer := range writers { + t.Run(writer.name, func(t *testing.T) { + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + rt := buildRoute(t, nil, test.linkEP()) + + var pkts stack.PacketBufferList + for i := 0; i < nPackets; i++ { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: header.UDPMinimumSize + int(rt.MaxHeaderLength()), + Data: buffer.NewView(0).ToVectorisedView(), + }) + pkt.TransportHeader().Push(header.UDPMinimumSize) + pkts.PushBack(pkt) + } + + test.setup(t, rt.Stack()) + + nWritten, _ := writer.writePackets(&rt, pkts) + + if got := int(rt.Stats().IP.PacketsSent.Value()); got != test.expectSent { + t.Errorf("sent %d packets, but expected to send %d", got, test.expectSent) + } + if got := int(rt.Stats().IP.IPTablesOutputDropped.Value()); got != test.expectDropped { + t.Errorf("dropped %d packets, but expected to drop %d", got, test.expectDropped) + } + if nWritten != test.expectWritten { + t.Errorf("wrote %d packets, but expected WritePackets to return %d", nWritten, test.expectWritten) + } + }) + } + }) + } +} + +func buildRoute(t *testing.T, packetCollectorErrors []*tcpip.Error, linkEP stack.LinkEndpoint) stack.Route { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()}, + }) + s.CreateNIC(1, linkEP) + const ( + src = "\x10\x00\x00\x01" + dst = "\x10\x00\x00\x02" + ) + s.AddAddress(1, ipv4.ProtocolNumber, src) + { + subnet, err := tcpip.NewSubnet(dst, tcpip.AddressMask(header.IPv4Broadcast)) + if err != nil { + t.Fatal(err) + } + s.SetRouteTable([]tcpip.Route{{ + Destination: subnet, + NIC: 1, + }}) + } + rt, err := s.FindRoute(0, src, dst, ipv4.ProtocolNumber, false /* multicastLoop */) + if err != nil { + t.Fatalf("s.FindRoute got %v, want %v", err, nil) + } + return rt +} + +// limitedEP is a link endpoint that writes up to a certain number of packets +// before returning errors. +type limitedEP struct { + limit int +} + +// MTU implements LinkEndpoint.MTU. +func (*limitedEP) MTU() uint32 { + // Give an MTU that won't cause fragmentation for IPv4+UDP. + return header.IPv4MinimumSize + header.UDPMinimumSize +} + +// Capabilities implements LinkEndpoint.Capabilities. +func (*limitedEP) Capabilities() stack.LinkEndpointCapabilities { return 0 } + +// MaxHeaderLength implements LinkEndpoint.MaxHeaderLength. +func (*limitedEP) MaxHeaderLength() uint16 { return 0 } + +// LinkAddress implements LinkEndpoint.LinkAddress. +func (*limitedEP) LinkAddress() tcpip.LinkAddress { return "" } + +// WritePacket implements LinkEndpoint.WritePacket. +func (ep *limitedEP) WritePacket(*stack.Route, *stack.GSO, tcpip.NetworkProtocolNumber, *stack.PacketBuffer) *tcpip.Error { + if ep.limit == 0 { + return tcpip.ErrInvalidEndpointState + } + ep.limit-- + return nil +} + +// WritePackets implements LinkEndpoint.WritePackets. +func (ep *limitedEP) WritePackets(_ *stack.Route, _ *stack.GSO, pkts stack.PacketBufferList, _ tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + if ep.limit == 0 { + return 0, tcpip.ErrInvalidEndpointState + } + nWritten := ep.limit + if nWritten > pkts.Len() { + nWritten = pkts.Len() + } + ep.limit -= nWritten + return nWritten, nil +} + +// WriteRawPacket implements LinkEndpoint.WriteRawPacket. +func (ep *limitedEP) WriteRawPacket(_ buffer.VectorisedView) *tcpip.Error { + if ep.limit == 0 { + return tcpip.ErrInvalidEndpointState + } + ep.limit-- + return nil +} + +// Attach implements LinkEndpoint.Attach. +func (*limitedEP) Attach(_ stack.NetworkDispatcher) {} + +// IsAttached implements LinkEndpoint.IsAttached. +func (*limitedEP) IsAttached() bool { return false } + +// Wait implements LinkEndpoint.Wait. +func (*limitedEP) Wait() {} + +// ARPHardwareType implements LinkEndpoint.ARPHardwareType. +func (*limitedEP) ARPHardwareType() header.ARPHardwareType { return header.ARPHardwareEther } + +// AddHeader implements LinkEndpoint.AddHeader. +func (*limitedEP) AddHeader(_, _ tcpip.LinkAddress, _ tcpip.NetworkProtocolNumber, _ *stack.PacketBuffer) { +} + +// limitedMatcher is an iptables matcher that matches after a certain number of +// packets are checked against it. +type limitedMatcher struct { + limit int +} + +// Name implements Matcher.Name. +func (*limitedMatcher) Name() string { + return "limitedMatcher" +} + +// Match implements Matcher.Match. +func (lm *limitedMatcher) Match(stack.Hook, *stack.PacketBuffer, string) (bool, bool) { + if lm.limit == 0 { + return true, false + } + lm.limit-- + return false, false +} diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD index bcc64994e..cd5fe3ea8 100644 --- a/pkg/tcpip/network/ipv6/BUILD +++ b/pkg/tcpip/network/ipv6/BUILD @@ -13,6 +13,7 @@ go_library( "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", + "//pkg/tcpip/header/parse", "//pkg/tcpip/network/fragmentation", "//pkg/tcpip/stack", ], diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go index e821a8bff..ee64d92d8 100644 --- a/pkg/tcpip/network/ipv6/ipv6.go +++ b/pkg/tcpip/network/ipv6/ipv6.go @@ -27,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/network/fragmentation" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -107,6 +108,32 @@ func (e *endpoint) addIPHeader(r *stack.Route, pkt *stack.PacketBuffer, params s func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error { e.addIPHeader(r, pkt, params) + // iptables filtering. All packets that reach here are locally + // generated. + nicName := e.stack.FindNICNameFromID(e.NICID()) + ipt := e.stack.IPTables() + if ok := ipt.Check(stack.Output, pkt, gso, r, "", nicName); !ok { + // iptables is telling us to drop the packet. + r.Stats().IP.IPTablesOutputDropped.Increment() + return nil + } + + // If the packet is manipulated as per NAT Output rules, handle packet + // based on destination address and do not send the packet to link + // layer. + // + // TODO(gvisor.dev/issue/170): We should do this for every + // packet, rather than only NATted packets, but removing this check + // short circuits broadcasts before they are sent out to other hosts. + if pkt.NatDone { + netHeader := header.IPv6(pkt.NetworkHeader().View()) + if ep, err := e.stack.FindNetworkEndpoint(header.IPv6ProtocolNumber, netHeader.DestinationAddress()); err == nil { + route := r.ReverseRoute(netHeader.SourceAddress(), netHeader.DestinationAddress()) + ep.HandlePacket(&route, pkt) + return nil + } + } + if r.Loop&stack.PacketLoop != 0 { loopedR := r.MakeLoopedRoute() @@ -121,8 +148,11 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw return nil } + if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, pkt); err != nil { + return err + } r.Stats().IP.PacketsSent.Increment() - return e.linkEP.WritePacket(r, gso, ProtocolNumber, pkt) + return nil } // WritePackets implements stack.LinkEndpoint.WritePackets. @@ -138,9 +168,50 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe e.addIPHeader(r, pb, params) } - n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber) + // iptables filtering. All packets that reach here are locally + // generated. + nicName := e.stack.FindNICNameFromID(e.NICID()) + ipt := e.stack.IPTables() + dropped, natPkts := ipt.CheckPackets(stack.Output, pkts, gso, r, nicName) + if len(dropped) == 0 && len(natPkts) == 0 { + // Fast path: If no packets are to be dropped then we can just invoke the + // faster WritePackets API directly. + n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber) + r.Stats().IP.PacketsSent.IncrementBy(uint64(n)) + return n, err + } + r.Stats().IP.IPTablesOutputDropped.IncrementBy(uint64(len(dropped))) + + // Slow path as we are dropping some packets in the batch degrade to + // emitting one packet at a time. + n := 0 + for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { + if _, ok := dropped[pkt]; ok { + continue + } + if _, ok := natPkts[pkt]; ok { + netHeader := header.IPv6(pkt.NetworkHeader().View()) + if ep, err := e.stack.FindNetworkEndpoint(header.IPv6ProtocolNumber, netHeader.DestinationAddress()); err == nil { + src := netHeader.SourceAddress() + dst := netHeader.DestinationAddress() + route := r.ReverseRoute(src, dst) + ep.HandlePacket(&route, pkt) + n++ + continue + } + } + if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, pkt); err != nil { + r.Stats().IP.PacketsSent.IncrementBy(uint64(n)) + // Dropped packets aren't errors, so include them in + // the return value. + return n + len(dropped), err + } + n++ + } + r.Stats().IP.PacketsSent.IncrementBy(uint64(n)) - return n, err + // Dropped packets aren't errors, so include them in the return value. + return n + len(dropped), nil } // WriteHeaderIncludedPacker implements stack.NetworkEndpoint. It is not yet @@ -169,6 +240,15 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) { it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(h.NextHeader()), vv) hasFragmentHeader := false + // iptables filtering. All packets that reach here are intended for + // this machine and will not be forwarded. + ipt := e.stack.IPTables() + if ok := ipt.Check(stack.Input, pkt, nil, nil, "", ""); !ok { + // iptables is telling us to drop the packet. + r.Stats().IP.IPTablesInputDropped.Increment() + return + } + for firstHeader := true; ; firstHeader = false { extHdr, done, err := it.Next() if err != nil { @@ -504,75 +584,14 @@ func (*protocol) Close() {} // Wait implements stack.TransportProtocol.Wait. func (*protocol) Wait() {} -// Parse implements stack.TransportProtocol.Parse. +// Parse implements stack.NetworkProtocol.Parse. func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) { - hdr, ok := pkt.Data.PullUp(header.IPv6MinimumSize) + proto, _, fragOffset, fragMore, ok := parse.IPv6(pkt) if !ok { return 0, false, false } - ipHdr := header.IPv6(hdr) - - // dataClone consists of: - // - Any IPv6 header bytes after the first 40 (i.e. extensions). - // - The transport header, if present. - // - Any other payload data. - views := [8]buffer.View{} - dataClone := pkt.Data.Clone(views[:]) - dataClone.TrimFront(header.IPv6MinimumSize) - it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(ipHdr.NextHeader()), dataClone) - - // Iterate over the IPv6 extensions to find their length. - // - // Parsing occurs again in HandlePacket because we don't track the - // extensions in PacketBuffer. Unfortunately, that means HandlePacket - // has to do the parsing work again. - var nextHdr tcpip.TransportProtocolNumber - foundNext := true - extensionsSize := 0 -traverseExtensions: - for extHdr, done, err := it.Next(); ; extHdr, done, err = it.Next() { - if err != nil { - break - } - // If we exhaust the extension list, the entire packet is the IPv6 header - // and (possibly) extensions. - if done { - extensionsSize = dataClone.Size() - foundNext = false - break - } - - switch extHdr := extHdr.(type) { - case header.IPv6FragmentExtHdr: - // If this is an atomic fragment, we don't have to treat it specially. - if !extHdr.More() && extHdr.FragmentOffset() == 0 { - continue - } - // This is a non-atomic fragment and has to be re-assembled before we can - // examine the payload for a transport header. - foundNext = false - - case header.IPv6RawPayloadHeader: - // We've found the payload after any extensions. - extensionsSize = dataClone.Size() - extHdr.Buf.Size() - nextHdr = tcpip.TransportProtocolNumber(extHdr.Identifier) - break traverseExtensions - - default: - // Any other extension is a no-op, keep looping until we find the payload. - } - } - - // Put the IPv6 header with extensions in pkt.NetworkHeader(). - hdr, ok = pkt.NetworkHeader().Consume(header.IPv6MinimumSize + extensionsSize) - if !ok { - panic(fmt.Sprintf("pkt.Data should have at least %d bytes, but only has %d.", header.IPv6MinimumSize+extensionsSize, pkt.Data.Size())) - } - ipHdr = header.IPv6(hdr) - pkt.Data.CapLength(int(ipHdr.PayloadLength())) - pkt.NetworkProtocolNumber = header.IPv6ProtocolNumber - return nextHdr, foundNext, true + return proto, !fragMore && fragOffset == 0, true } // calculateMTU calculates the network-layer payload MTU based on the link-layer diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go index 5f9822c49..9eea1de8d 100644 --- a/pkg/tcpip/network/ipv6/ipv6_test.go +++ b/pkg/tcpip/network/ipv6/ipv6_test.go @@ -1709,3 +1709,250 @@ func TestInvalidIPv6Fragments(t *testing.T) { }) } } + +func TestWriteStats(t *testing.T) { + const nPackets = 3 + tests := []struct { + name string + setup func(*testing.T, *stack.Stack) + linkEP func() stack.LinkEndpoint + expectSent int + expectDropped int + expectWritten int + }{ + { + name: "Accept all", + // No setup needed, tables accept everything by default. + setup: func(*testing.T, *stack.Stack) {}, + linkEP: func() stack.LinkEndpoint { return &limitedEP{nPackets} }, + expectSent: nPackets, + expectDropped: 0, + expectWritten: nPackets, + }, { + name: "Accept all with error", + // No setup needed, tables accept everything by default. + setup: func(*testing.T, *stack.Stack) {}, + linkEP: func() stack.LinkEndpoint { return &limitedEP{nPackets - 1} }, + expectSent: nPackets - 1, + expectDropped: 0, + expectWritten: nPackets - 1, + }, { + name: "Drop all", + setup: func(t *testing.T, stk *stack.Stack) { + // Install Output DROP rule. + t.Helper() + ipt := stk.IPTables() + filter, ok := ipt.GetTable(stack.FilterTable, true /* ipv6 */) + if !ok { + t.Fatalf("failed to find filter table") + } + ruleIdx := filter.BuiltinChains[stack.Output] + filter.Rules[ruleIdx].Target = stack.DropTarget{} + if err := ipt.ReplaceTable(stack.FilterTable, filter, true /* ipv6 */); err != nil { + t.Fatalf("failed to replace table: %v", err) + } + }, + linkEP: func() stack.LinkEndpoint { return &limitedEP{nPackets} }, + expectSent: 0, + expectDropped: nPackets, + expectWritten: nPackets, + }, { + name: "Drop some", + setup: func(t *testing.T, stk *stack.Stack) { + // Install Output DROP rule that matches only 1 + // of the 3 packets. + t.Helper() + ipt := stk.IPTables() + filter, ok := ipt.GetTable(stack.FilterTable, true /* ipv6 */) + if !ok { + t.Fatalf("failed to find filter table") + } + // We'll match and DROP the last packet. + ruleIdx := filter.BuiltinChains[stack.Output] + filter.Rules[ruleIdx].Target = stack.DropTarget{} + filter.Rules[ruleIdx].Matchers = []stack.Matcher{&limitedMatcher{nPackets - 1}} + // Make sure the next rule is ACCEPT. + filter.Rules[ruleIdx+1].Target = stack.AcceptTarget{} + if err := ipt.ReplaceTable(stack.FilterTable, filter, true /* ipv6 */); err != nil { + t.Fatalf("failed to replace table: %v", err) + } + }, + linkEP: func() stack.LinkEndpoint { return &limitedEP{nPackets} }, + expectSent: nPackets - 1, + expectDropped: 1, + expectWritten: nPackets, + }, + } + + writers := []struct { + name string + writePackets func(*stack.Route, stack.PacketBufferList) (int, *tcpip.Error) + }{ + { + name: "WritePacket", + writePackets: func(rt *stack.Route, pkts stack.PacketBufferList) (int, *tcpip.Error) { + nWritten := 0 + for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { + if err := rt.WritePacket(nil, stack.NetworkHeaderParams{}, pkt); err != nil { + return nWritten, err + } + nWritten++ + } + return nWritten, nil + }, + }, { + name: "WritePackets", + writePackets: func(rt *stack.Route, pkts stack.PacketBufferList) (int, *tcpip.Error) { + return rt.WritePackets(nil, pkts, stack.NetworkHeaderParams{}) + }, + }, + } + + for _, writer := range writers { + t.Run(writer.name, func(t *testing.T) { + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + rt := buildRoute(t, nil, test.linkEP()) + + var pkts stack.PacketBufferList + for i := 0; i < nPackets; i++ { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: header.UDPMinimumSize + int(rt.MaxHeaderLength()), + Data: buffer.NewView(0).ToVectorisedView(), + }) + pkt.TransportHeader().Push(header.UDPMinimumSize) + pkts.PushBack(pkt) + } + + test.setup(t, rt.Stack()) + + nWritten, _ := writer.writePackets(&rt, pkts) + + if got := int(rt.Stats().IP.PacketsSent.Value()); got != test.expectSent { + t.Errorf("sent %d packets, but expected to send %d", got, test.expectSent) + } + if got := int(rt.Stats().IP.IPTablesOutputDropped.Value()); got != test.expectDropped { + t.Errorf("dropped %d packets, but expected to drop %d", got, test.expectDropped) + } + if nWritten != test.expectWritten { + t.Errorf("wrote %d packets, but expected WritePackets to return %d", nWritten, test.expectWritten) + } + }) + } + }) + } +} + +func buildRoute(t *testing.T, packetCollectorErrors []*tcpip.Error, linkEP stack.LinkEndpoint) stack.Route { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{NewProtocol()}, + }) + s.CreateNIC(1, linkEP) + const ( + src = "\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" + dst = "\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" + ) + s.AddAddress(1, ProtocolNumber, src) + { + subnet, err := tcpip.NewSubnet(dst, tcpip.AddressMask("\xfc\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff")) + if err != nil { + t.Fatal(err) + } + s.SetRouteTable([]tcpip.Route{{ + Destination: subnet, + NIC: 1, + }}) + } + rt, err := s.FindRoute(0, src, dst, ProtocolNumber, false /* multicastLoop */) + if err != nil { + t.Fatalf("s.FindRoute got %v, want %v", err, nil) + } + return rt +} + +// limitedEP is a link endpoint that writes up to a certain number of packets +// before returning errors. +type limitedEP struct { + limit int +} + +// MTU implements LinkEndpoint.MTU. +func (*limitedEP) MTU() uint32 { + return header.IPv6MinimumMTU +} + +// Capabilities implements LinkEndpoint.Capabilities. +func (*limitedEP) Capabilities() stack.LinkEndpointCapabilities { return 0 } + +// MaxHeaderLength implements LinkEndpoint.MaxHeaderLength. +func (*limitedEP) MaxHeaderLength() uint16 { return 0 } + +// LinkAddress implements LinkEndpoint.LinkAddress. +func (*limitedEP) LinkAddress() tcpip.LinkAddress { return "" } + +// WritePacket implements LinkEndpoint.WritePacket. +func (ep *limitedEP) WritePacket(*stack.Route, *stack.GSO, tcpip.NetworkProtocolNumber, *stack.PacketBuffer) *tcpip.Error { + if ep.limit == 0 { + return tcpip.ErrInvalidEndpointState + } + ep.limit-- + return nil +} + +// WritePackets implements LinkEndpoint.WritePackets. +func (ep *limitedEP) WritePackets(_ *stack.Route, _ *stack.GSO, pkts stack.PacketBufferList, _ tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + if ep.limit == 0 { + return 0, tcpip.ErrInvalidEndpointState + } + nWritten := ep.limit + if nWritten > pkts.Len() { + nWritten = pkts.Len() + } + ep.limit -= nWritten + return nWritten, nil +} + +// WriteRawPacket implements LinkEndpoint.WriteRawPacket. +func (ep *limitedEP) WriteRawPacket(_ buffer.VectorisedView) *tcpip.Error { + if ep.limit == 0 { + return tcpip.ErrInvalidEndpointState + } + ep.limit-- + return nil +} + +// Attach implements LinkEndpoint.Attach. +func (*limitedEP) Attach(_ stack.NetworkDispatcher) {} + +// IsAttached implements LinkEndpoint.IsAttached. +func (*limitedEP) IsAttached() bool { return false } + +// Wait implements LinkEndpoint.Wait. +func (*limitedEP) Wait() {} + +// ARPHardwareType implements LinkEndpoint.ARPHardwareType. +func (*limitedEP) ARPHardwareType() header.ARPHardwareType { return header.ARPHardwareEther } + +// AddHeader implements LinkEndpoint.AddHeader. +func (*limitedEP) AddHeader(_, _ tcpip.LinkAddress, _ tcpip.NetworkProtocolNumber, _ *stack.PacketBuffer) { +} + +// limitedMatcher is an iptables matcher that matches after a certain number of +// packets are checked against it. +type limitedMatcher struct { + limit int +} + +// Name implements Matcher.Name. +func (*limitedMatcher) Name() string { + return "limitedMatcher" +} + +// Match implements Matcher.Match. +func (lm *limitedMatcher) Match(stack.Hook, *stack.PacketBuffer, string) (bool, bool) { + if lm.limit == 0 { + return true, false + } + lm.limit-- + return false, false +} diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go index 0e33cbe92..4a521eca9 100644 --- a/pkg/tcpip/stack/iptables.go +++ b/pkg/tcpip/stack/iptables.go @@ -57,7 +57,72 @@ const reaperDelay = 5 * time.Second // all packets. func DefaultTables() *IPTables { return &IPTables{ - tables: [numTables]Table{ + v4Tables: [numTables]Table{ + natID: Table{ + Rules: []Rule{ + Rule{Target: AcceptTarget{}}, + Rule{Target: AcceptTarget{}}, + Rule{Target: AcceptTarget{}}, + Rule{Target: AcceptTarget{}}, + Rule{Target: ErrorTarget{}}, + }, + BuiltinChains: [NumHooks]int{ + Prerouting: 0, + Input: 1, + Forward: HookUnset, + Output: 2, + Postrouting: 3, + }, + Underflows: [NumHooks]int{ + Prerouting: 0, + Input: 1, + Forward: HookUnset, + Output: 2, + Postrouting: 3, + }, + }, + mangleID: Table{ + Rules: []Rule{ + Rule{Target: AcceptTarget{}}, + Rule{Target: AcceptTarget{}}, + Rule{Target: ErrorTarget{}}, + }, + BuiltinChains: [NumHooks]int{ + Prerouting: 0, + Output: 1, + }, + Underflows: [NumHooks]int{ + Prerouting: 0, + Input: HookUnset, + Forward: HookUnset, + Output: 1, + Postrouting: HookUnset, + }, + }, + filterID: Table{ + Rules: []Rule{ + Rule{Target: AcceptTarget{}}, + Rule{Target: AcceptTarget{}}, + Rule{Target: AcceptTarget{}}, + Rule{Target: ErrorTarget{}}, + }, + BuiltinChains: [NumHooks]int{ + Prerouting: HookUnset, + Input: 0, + Forward: 1, + Output: 2, + Postrouting: HookUnset, + }, + Underflows: [NumHooks]int{ + Prerouting: HookUnset, + Input: 0, + Forward: 1, + Output: 2, + Postrouting: HookUnset, + }, + }, + }, + v6Tables: [numTables]Table{ natID: Table{ Rules: []Rule{ Rule{Target: AcceptTarget{}}, @@ -166,25 +231,20 @@ func EmptyNATTable() Table { // GetTable returns a table by name. func (it *IPTables) GetTable(name string, ipv6 bool) (Table, bool) { - // TODO(gvisor.dev/issue/3549): Enable IPv6. - if ipv6 { - return Table{}, false - } id, ok := nameToID[name] if !ok { return Table{}, false } it.mu.RLock() defer it.mu.RUnlock() - return it.tables[id], true + if ipv6 { + return it.v6Tables[id], true + } + return it.v4Tables[id], true } // ReplaceTable replaces or inserts table by name. func (it *IPTables) ReplaceTable(name string, table Table, ipv6 bool) *tcpip.Error { - // TODO(gvisor.dev/issue/3549): Enable IPv6. - if ipv6 { - return tcpip.ErrInvalidOptionValue - } id, ok := nameToID[name] if !ok { return tcpip.ErrInvalidOptionValue @@ -198,7 +258,11 @@ func (it *IPTables) ReplaceTable(name string, table Table, ipv6 bool) *tcpip.Err it.startReaper(reaperDelay) } it.modified = true - it.tables[id] = table + if ipv6 { + it.v6Tables[id] = table + } else { + it.v4Tables[id] = table + } return nil } @@ -221,8 +285,15 @@ const ( // should continue traversing the network stack and false when it should be // dropped. // +// TODO(gvisor.dev/issue/170): PacketBuffer should hold the GSO and route, from +// which address and nicName can be gathered. Currently, address is only +// needed for prerouting and nicName is only needed for output. +// // Precondition: pkt.NetworkHeader is set. -func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, address tcpip.Address, nicName string) bool { +func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, preroutingAddr tcpip.Address, nicName string) bool { + if pkt.NetworkProtocolNumber != header.IPv4ProtocolNumber && pkt.NetworkProtocolNumber != header.IPv6ProtocolNumber { + return true + } // Many users never configure iptables. Spare them the cost of rule // traversal if rules have never been set. it.mu.RLock() @@ -243,9 +314,14 @@ func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, addr if tableID == natID && pkt.NatDone { continue } - table := it.tables[tableID] + var table Table + if pkt.NetworkProtocolNumber == header.IPv6ProtocolNumber { + table = it.v6Tables[tableID] + } else { + table = it.v4Tables[tableID] + } ruleIdx := table.BuiltinChains[hook] - switch verdict := it.checkChain(hook, pkt, table, ruleIdx, gso, r, address, nicName); verdict { + switch verdict := it.checkChain(hook, pkt, table, ruleIdx, gso, r, preroutingAddr, nicName); verdict { // If the table returns Accept, move on to the next table. case chainAccept: continue @@ -256,7 +332,7 @@ func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, addr // Any Return from a built-in chain means we have to // call the underflow. underflow := table.Rules[table.Underflows[hook]] - switch v, _ := underflow.Target.Action(pkt, &it.connections, hook, gso, r, address); v { + switch v, _ := underflow.Target.Action(pkt, &it.connections, hook, gso, r, preroutingAddr); v { case RuleAccept: continue case RuleDrop: @@ -351,11 +427,11 @@ func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList, gso *GSO, r * // Preconditions: // * pkt is a IPv4 packet of at least length header.IPv4MinimumSize. // * pkt.NetworkHeader is not nil. -func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, address tcpip.Address, nicName string) chainVerdict { +func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, preroutingAddr tcpip.Address, nicName string) chainVerdict { // Start from ruleIdx and walk the list of rules until a rule gives us // a verdict. for ruleIdx < len(table.Rules) { - switch verdict, jumpTo := it.checkRule(hook, pkt, table, ruleIdx, gso, r, address, nicName); verdict { + switch verdict, jumpTo := it.checkRule(hook, pkt, table, ruleIdx, gso, r, preroutingAddr, nicName); verdict { case RuleAccept: return chainAccept @@ -372,7 +448,7 @@ func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleId ruleIdx++ continue } - switch verdict := it.checkChain(hook, pkt, table, jumpTo, gso, r, address, nicName); verdict { + switch verdict := it.checkChain(hook, pkt, table, jumpTo, gso, r, preroutingAddr, nicName); verdict { case chainAccept: return chainAccept case chainDrop: @@ -398,11 +474,11 @@ func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleId // Preconditions: // * pkt is a IPv4 packet of at least length header.IPv4MinimumSize. // * pkt.NetworkHeader is not nil. -func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, address tcpip.Address, nicName string) (RuleVerdict, int) { +func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, preroutingAddr tcpip.Address, nicName string) (RuleVerdict, int) { rule := table.Rules[ruleIdx] // Check whether the packet matches the IP header filter. - if !rule.Filter.match(header.IPv4(pkt.NetworkHeader().View()), hook, nicName) { + if !rule.Filter.match(pkt, hook, nicName) { // Continue on to the next rule. return RuleJump, ruleIdx + 1 } @@ -421,7 +497,7 @@ func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx } // All the matchers matched, so run the target. - return rule.Target.Action(pkt, &it.connections, hook, gso, r, address) + return rule.Target.Action(pkt, &it.connections, hook, gso, r, preroutingAddr) } // OriginalDst returns the original destination of redirected connections. It diff --git a/pkg/tcpip/stack/iptables_types.go b/pkg/tcpip/stack/iptables_types.go index fbbd2f50f..093ee6881 100644 --- a/pkg/tcpip/stack/iptables_types.go +++ b/pkg/tcpip/stack/iptables_types.go @@ -15,6 +15,7 @@ package stack import ( + "fmt" "strings" "sync" @@ -81,26 +82,25 @@ const ( // // +stateify savable type IPTables struct { - // mu protects tables, priorities, and modified. + // mu protects v4Tables, v6Tables, and modified. mu sync.RWMutex - - // tables maps tableIDs to tables. Holds builtin tables only, not user - // tables. mu must be locked for accessing. - tables [numTables]Table - - // priorities maps each hook to a list of table names. The order of the - // list is the order in which each table should be visited for that - // hook. mu needs to be locked for accessing. - priorities [NumHooks][]tableID - + // v4Tables and v6tables map tableIDs to tables. They hold builtin + // tables only, not user tables. mu must be locked for accessing. + v4Tables [numTables]Table + v6Tables [numTables]Table // modified is whether tables have been modified at least once. It is // used to elide the iptables performance overhead for workloads that // don't utilize iptables. modified bool + // priorities maps each hook to a list of table names. The order of the + // list is the order in which each table should be visited for that + // hook. It is immutable. + priorities [NumHooks][]tableID + connections ConnTrack - // reaperDone can be signalled to stop the reaper goroutine. + // reaperDone can be signaled to stop the reaper goroutine. reaperDone chan struct{} } @@ -148,7 +148,7 @@ type Rule struct { Target Target } -// IPHeaderFilter holds basic IP filtering data common to every rule. +// IPHeaderFilter performs basic IP header matching common to every rule. // // +stateify savable type IPHeaderFilter struct { @@ -196,16 +196,43 @@ type IPHeaderFilter struct { OutputInterfaceInvert bool } -// match returns whether hdr matches the filter. -func (fl IPHeaderFilter) match(hdr header.IPv4, hook Hook, nicName string) bool { - // TODO(gvisor.dev/issue/170): Support other fields of the filter. +// match returns whether pkt matches the filter. +// +// Preconditions: pkt.NetworkHeader is set and is at least of the minimal IPv4 +// or IPv6 header length. +func (fl IPHeaderFilter) match(pkt *PacketBuffer, hook Hook, nicName string) bool { + // Extract header fields. + var ( + // TODO(gvisor.dev/issue/170): Support other filter fields. + transProto tcpip.TransportProtocolNumber + dstAddr tcpip.Address + srcAddr tcpip.Address + ) + switch proto := pkt.NetworkProtocolNumber; proto { + case header.IPv4ProtocolNumber: + hdr := header.IPv4(pkt.NetworkHeader().View()) + transProto = hdr.TransportProtocol() + dstAddr = hdr.DestinationAddress() + srcAddr = hdr.SourceAddress() + + case header.IPv6ProtocolNumber: + hdr := header.IPv6(pkt.NetworkHeader().View()) + transProto = hdr.TransportProtocol() + dstAddr = hdr.DestinationAddress() + srcAddr = hdr.SourceAddress() + + default: + panic(fmt.Sprintf("unknown network protocol with EtherType: %d", proto)) + } + // Check the transport protocol. - if fl.Protocol != 0 && fl.Protocol != hdr.TransportProtocol() { + if fl.CheckProtocol && fl.Protocol != transProto { return false } - // Check the source and destination IPs. - if !filterAddress(hdr.DestinationAddress(), fl.DstMask, fl.Dst, fl.DstInvert) || !filterAddress(hdr.SourceAddress(), fl.SrcMask, fl.Src, fl.SrcInvert) { + // Check the addresses. + if !filterAddress(dstAddr, fl.DstMask, fl.Dst, fl.DstInvert) || + !filterAddress(srcAddr, fl.SrcMask, fl.Src, fl.SrcInvert) { return false } diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 1f1a1426b..204bfc433 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -1282,14 +1282,14 @@ func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcp return } - // TODO(gvisor.dev/issue/170): Not supporting iptables for IPv6 yet. // Loopback traffic skips the prerouting chain. - if protocol == header.IPv4ProtocolNumber && !n.isLoopback() { + if !n.isLoopback() { // iptables filtering. ipt := n.stack.IPTables() address := n.primaryAddress(protocol) if ok := ipt.Check(Prerouting, pkt, nil, nil, address.Address, ""); !ok { // iptables is telling us to drop the packet. + n.stack.stats.IP.IPTablesPreroutingDropped.Increment() return } } diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go index 17b8beebb..1932aaeb7 100644 --- a/pkg/tcpip/stack/packet_buffer.go +++ b/pkg/tcpip/stack/packet_buffer.go @@ -80,7 +80,7 @@ type PacketBuffer struct { // data are held in the same underlying buffer storage. header buffer.Prependable - // NetworkProtocol is only valid when NetworkHeader is set. + // NetworkProtocolNumber is only valid when NetworkHeader is set. // TODO(gvisor.dev/issue/3574): Remove the separately passed protocol // numbers in registration APIs that take a PacketBuffer. NetworkProtocolNumber tcpip.NetworkProtocolNumber diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index b2ddb24ec..464608dee 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -1474,6 +1474,18 @@ type IPStats struct { // MalformedFragmentsReceived is the total number of IP Fragments that were // dropped due to the fragment failing validation checks. MalformedFragmentsReceived *StatCounter + + // IPTablesPreroutingDropped is the total number of IP packets dropped + // in the Prerouting chain. + IPTablesPreroutingDropped *StatCounter + + // IPTablesInputDropped is the total number of IP packets dropped in + // the Input chain. + IPTablesInputDropped *StatCounter + + // IPTablesOutputDropped is the total number of IP packets dropped in + // the Output chain. + IPTablesOutputDropped *StatCounter } // TCPStats collects TCP-specific stats. diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index ad71ff3b6..31116309e 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -74,6 +74,8 @@ type endpoint struct { route stack.Route `state:"manual"` ttl uint8 stats tcpip.TransportEndpointStats `state:"nosave"` + // linger is used for SO_LINGER socket option. + linger tcpip.LingerOption // owner is used to get uid and gid of the packet. owner tcpip.PacketOwner @@ -344,9 +346,14 @@ func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) { // SetSockOpt sets a socket option. func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { - switch opt.(type) { + switch v := opt.(type) { case *tcpip.SocketDetachFilterOption: return nil + + case *tcpip.LingerOption: + e.mu.Lock() + e.linger = *v + e.mu.Unlock() } return nil } @@ -415,8 +422,17 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. -func (*endpoint) GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption +func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { + switch o := opt.(type) { + case *tcpip.LingerOption: + e.mu.Lock() + *o = e.linger + e.mu.Unlock() + return nil + + default: + return tcpip.ErrUnknownProtocolOption + } } func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8, owner tcpip.PacketOwner) *tcpip.Error { diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go index 8bd4e5e37..072601d2d 100644 --- a/pkg/tcpip/transport/packet/endpoint.go +++ b/pkg/tcpip/transport/packet/endpoint.go @@ -83,6 +83,8 @@ type endpoint struct { stats tcpip.TransportEndpointStats `state:"nosave"` bound bool boundNIC tcpip.NICID + // linger is used for SO_LINGER socket option. + linger tcpip.LingerOption // lastErrorMu protects lastError. lastErrorMu sync.Mutex `state:"nosave"` @@ -298,10 +300,16 @@ func (ep *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { // used with SetSockOpt, and this function always returns // tcpip.ErrNotSupported. func (ep *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { - switch opt.(type) { + switch v := opt.(type) { case *tcpip.SocketDetachFilterOption: return nil + case *tcpip.LingerOption: + ep.mu.Lock() + ep.linger = *v + ep.mu.Unlock() + return nil + default: return tcpip.ErrUnknownProtocolOption } @@ -366,8 +374,17 @@ func (ep *endpoint) LastError() *tcpip.Error { } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. -func (*endpoint) GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error { - return tcpip.ErrNotSupported +func (ep *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { + switch o := opt.(type) { + case *tcpip.LingerOption: + ep.mu.Lock() + *o = ep.linger + ep.mu.Unlock() + return nil + + default: + return tcpip.ErrNotSupported + } } // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool. diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go index fb03e6047..e37c00523 100644 --- a/pkg/tcpip/transport/raw/endpoint.go +++ b/pkg/tcpip/transport/raw/endpoint.go @@ -84,6 +84,8 @@ type endpoint struct { // Connect(), and is valid only when conneted is true. route stack.Route `state:"manual"` stats tcpip.TransportEndpointStats `state:"nosave"` + // linger is used for SO_LINGER socket option. + linger tcpip.LingerOption // owner is used to get uid and gid of the packet. owner tcpip.PacketOwner @@ -511,10 +513,16 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { // SetSockOpt implements tcpip.Endpoint.SetSockOpt. func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { - switch opt.(type) { + switch v := opt.(type) { case *tcpip.SocketDetachFilterOption: return nil + case *tcpip.LingerOption: + e.mu.Lock() + e.linger = *v + e.mu.Unlock() + return nil + default: return tcpip.ErrUnknownProtocolOption } @@ -577,8 +585,17 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. -func (*endpoint) GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption +func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { + switch o := opt.(type) { + case *tcpip.LingerOption: + e.mu.Lock() + *o = e.linger + e.mu.Unlock() + return nil + + default: + return tcpip.ErrUnknownProtocolOption + } } // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool. diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index 234fb95ce..4778e7b1c 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -69,6 +69,7 @@ go_library( "//pkg/tcpip/buffer", "//pkg/tcpip/hash/jenkins", "//pkg/tcpip/header", + "//pkg/tcpip/header/parse", "//pkg/tcpip/ports", "//pkg/tcpip/seqnum", "//pkg/tcpip/stack", diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index faea7f2bb..120483838 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1317,14 +1317,17 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) { // indicating the reason why it's not writable. // Caller must hold e.mu and e.sndBufMu func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) { - // The endpoint cannot be written to if it's not connected. - if !e.EndpointState().connected() { - switch e.EndpointState() { - case StateError: - return 0, e.HardError - default: - return 0, tcpip.ErrClosedForSend - } + switch s := e.EndpointState(); { + case s == StateError: + return 0, e.HardError + case !s.connecting() && !s.connected(): + return 0, tcpip.ErrClosedForSend + case s.connecting(): + // As per RFC793, page 56, a send request arriving when in connecting + // state, can be queued to be completed after the state becomes + // connected. Return an error code for the caller of endpoint Write to + // try again, until the connection handshake is complete. + return 0, tcpip.ErrWouldBlock } // Check if the connection has already been closed for sends. diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index 63ec12be8..74a17af79 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -29,6 +29,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/raw" @@ -506,22 +507,7 @@ func (p *protocol) SynRcvdCounter() *synRcvdCounter { // Parse implements stack.TransportProtocol.Parse. func (*protocol) Parse(pkt *stack.PacketBuffer) bool { - // TCP header is variable length, peek at it first. - hdrLen := header.TCPMinimumSize - hdr, ok := pkt.Data.PullUp(hdrLen) - if !ok { - return false - } - - // If the header has options, pull those up as well. - if offset := int(header.TCP(hdr).DataOffset()); offset > header.TCPMinimumSize && offset <= pkt.Data.Size() { - // TODO(gvisor.dev/issue/2404): Figure out whether to reject this kind of - // packets. - hdrLen = offset - } - - _, ok = pkt.TransportHeader().Consume(hdrLen) - return ok + return parse.TCP(pkt) } // NewProtocol returns a TCP transport protocol. diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD index b5d2d0ba6..c78549424 100644 --- a/pkg/tcpip/transport/udp/BUILD +++ b/pkg/tcpip/transport/udp/BUILD @@ -32,6 +32,7 @@ go_library( "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", + "//pkg/tcpip/header/parse", "//pkg/tcpip/ports", "//pkg/tcpip/stack", "//pkg/tcpip/transport/raw", diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index b572c39db..518f636f0 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -154,6 +154,9 @@ type endpoint struct { // owner is used to get uid and gid of the packet. owner tcpip.PacketOwner + + // linger is used for SO_LINGER socket option. + linger tcpip.LingerOption } // +stateify savable @@ -810,6 +813,11 @@ func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { case *tcpip.SocketDetachFilterOption: return nil + + case *tcpip.LingerOption: + e.mu.Lock() + e.linger = *v + e.mu.Unlock() } return nil } @@ -966,6 +974,11 @@ func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { *o = tcpip.BindToDeviceOption(e.bindToDevice) e.mu.RUnlock() + case *tcpip.LingerOption: + e.mu.RLock() + *o = e.linger + e.mu.RUnlock() + default: return tcpip.ErrUnknownProtocolOption } diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go index 3f87e8057..7d6b91a75 100644 --- a/pkg/tcpip/transport/udp/protocol.go +++ b/pkg/tcpip/transport/udp/protocol.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/raw" "gvisor.dev/gvisor/pkg/waiter" @@ -219,8 +220,7 @@ func (*protocol) Wait() {} // Parse implements stack.TransportProtocol.Parse. func (*protocol) Parse(pkt *stack.PacketBuffer) bool { - _, ok := pkt.TransportHeader().Consume(header.UDPMinimumSize) - return ok + return parse.UDP(pkt) } // NewProtocol returns a UDP transport protocol. diff --git a/pkg/test/testutil/testutil.go b/pkg/test/testutil/testutil.go index b7f873392..06fb823f6 100644 --- a/pkg/test/testutil/testutil.go +++ b/pkg/test/testutil/testutil.go @@ -332,13 +332,13 @@ func PollContext(ctx context.Context, cb func() error) error { } // WaitForHTTP tries GET requests on a port until the call succeeds or timeout. -func WaitForHTTP(port int, timeout time.Duration) error { +func WaitForHTTP(ip string, port int, timeout time.Duration) error { cb := func() error { c := &http.Client{ // Calculate timeout to be able to do minimum 5 attempts. Timeout: timeout / 5, } - url := fmt.Sprintf("http://localhost:%d/", port) + url := fmt.Sprintf("http://%s:%d/", ip, port) resp, err := c.Get(url) if err != nil { log.Printf("Waiting %s: %v", url, err) diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go index 9b1799416..24e13565e 100644 --- a/runsc/boot/filter/config_amd64.go +++ b/runsc/boot/filter/config_amd64.go @@ -25,7 +25,6 @@ import ( func init() { allowedSyscalls[syscall.SYS_ARCH_PRCTL] = append(allowedSyscalls[syscall.SYS_ARCH_PRCTL], - seccomp.Rule{seccomp.EqualTo(linux.ARCH_GET_FS)}, seccomp.Rule{seccomp.EqualTo(linux.ARCH_SET_FS)}, ) } diff --git a/runsc/fsgofer/filter/config_amd64.go b/runsc/fsgofer/filter/config_amd64.go index 53506b5e1..39f9851a8 100644 --- a/runsc/fsgofer/filter/config_amd64.go +++ b/runsc/fsgofer/filter/config_amd64.go @@ -25,7 +25,6 @@ import ( func init() { allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{ - {seccomp.EqualTo(linux.ARCH_GET_FS)}, {seccomp.EqualTo(linux.ARCH_SET_FS)}, } diff --git a/test/e2e/integration_test.go b/test/e2e/integration_test.go index 809244bab..8425abecb 100644 --- a/test/e2e/integration_test.go +++ b/test/e2e/integration_test.go @@ -64,9 +64,10 @@ func TestLifeCycle(t *testing.T) { defer d.CleanUp(ctx) // Start the container. + port := 80 if err := d.Create(ctx, dockerutil.RunOpts{ Image: "basic/nginx", - Ports: []int{80}, + Ports: []int{port}, }); err != nil { t.Fatalf("docker create failed: %v", err) } @@ -74,16 +75,15 @@ func TestLifeCycle(t *testing.T) { t.Fatalf("docker start failed: %v", err) } - // Test that container is working. - port, err := d.FindPort(ctx, 80) + ip, err := d.FindIP(ctx, false) if err != nil { - t.Fatalf("docker.FindPort(80) failed: %v", err) + t.Fatalf("docker.FindIP failed: %v", err) } - if err := testutil.WaitForHTTP(port, defaultWait); err != nil { + if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil { t.Fatalf("WaitForHTTP() timeout: %v", err) } client := http.Client{Timeout: defaultWait} - if err := httpRequestSucceeds(client, "localhost", port); err != nil { + if err := httpRequestSucceeds(client, ip.String(), port); err != nil { t.Errorf("http request failed: %v", err) } @@ -105,27 +105,28 @@ func TestPauseResume(t *testing.T) { defer d.CleanUp(ctx) // Start the container. + port := 8080 if err := d.Spawn(ctx, dockerutil.RunOpts{ Image: "basic/python", - Ports: []int{8080}, // See Dockerfile. + Ports: []int{port}, // See Dockerfile. }); err != nil { t.Fatalf("docker run failed: %v", err) } - // Find where port 8080 is mapped to. - port, err := d.FindPort(ctx, 8080) + // Find container IP address. + ip, err := d.FindIP(ctx, false) if err != nil { - t.Fatalf("docker.FindPort(8080) failed: %v", err) + t.Fatalf("docker.FindIP failed: %v", err) } // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, defaultWait); err != nil { + if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil { t.Fatalf("WaitForHTTP() timeout: %v", err) } // Check that container is working. client := http.Client{Timeout: defaultWait} - if err := httpRequestSucceeds(client, "localhost", port); err != nil { + if err := httpRequestSucceeds(client, ip.String(), port); err != nil { t.Error("http request failed:", err) } @@ -135,7 +136,7 @@ func TestPauseResume(t *testing.T) { // Check if container is paused. client = http.Client{Timeout: 10 * time.Millisecond} // Don't wait a minute. - switch _, err := client.Get(fmt.Sprintf("http://localhost:%d", port)); v := err.(type) { + switch _, err := client.Get(fmt.Sprintf("http://%s:%d", ip.String(), port)); v := err.(type) { case nil: t.Errorf("http req expected to fail but it succeeded") case net.Error: @@ -151,13 +152,13 @@ func TestPauseResume(t *testing.T) { } // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, defaultWait); err != nil { + if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil { t.Fatalf("WaitForHTTP() timeout: %v", err) } // Check if container is working again. client = http.Client{Timeout: defaultWait} - if err := httpRequestSucceeds(client, "localhost", port); err != nil { + if err := httpRequestSucceeds(client, ip.String(), port); err != nil { t.Error("http request failed:", err) } } @@ -179,9 +180,10 @@ func TestCheckpointRestore(t *testing.T) { defer d.CleanUp(ctx) // Start the container. + port := 8080 if err := d.Spawn(ctx, dockerutil.RunOpts{ Image: "basic/python", - Ports: []int{8080}, // See Dockerfile. + Ports: []int{port}, // See Dockerfile. }); err != nil { t.Fatalf("docker run failed: %v", err) } @@ -199,20 +201,20 @@ func TestCheckpointRestore(t *testing.T) { t.Fatalf("docker restore failed: %v", err) } - // Find where port 8080 is mapped to. - port, err := d.FindPort(ctx, 8080) + // Find container IP address. + ip, err := d.FindIP(ctx, false) if err != nil { - t.Fatalf("docker.FindPort(8080) failed: %v", err) + t.Fatalf("docker.FindIP failed: %v", err) } // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, defaultWait); err != nil { + if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil { t.Fatalf("WaitForHTTP() timeout: %v", err) } // Check if container is working again. client := http.Client{Timeout: defaultWait} - if err := httpRequestSucceeds(client, "localhost", port); err != nil { + if err := httpRequestSucceeds(client, ip.String(), port); err != nil { t.Error("http request failed:", err) } } diff --git a/test/image/image_test.go b/test/image/image_test.go index ac6186688..968e62f63 100644 --- a/test/image/image_test.go +++ b/test/image/image_test.go @@ -63,8 +63,8 @@ func TestHelloWorld(t *testing.T) { } } -func runHTTPRequest(port int) error { - url := fmt.Sprintf("http://localhost:%d/not-found", port) +func runHTTPRequest(ip string, port int) error { + url := fmt.Sprintf("http://%s:%d/not-found", ip, port) resp, err := http.Get(url) if err != nil { return fmt.Errorf("error reaching http server: %v", err) @@ -73,7 +73,7 @@ func runHTTPRequest(port int) error { return fmt.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want) } - url = fmt.Sprintf("http://localhost:%d/latin10k.txt", port) + url = fmt.Sprintf("http://%s:%d/latin10k.txt", ip, port) resp, err = http.Get(url) if err != nil { return fmt.Errorf("Error reaching http server: %v", err) @@ -95,13 +95,13 @@ func runHTTPRequest(port int) error { return nil } -func testHTTPServer(t *testing.T, port int) { +func testHTTPServer(t *testing.T, ip string, port int) { const requests = 10 ch := make(chan error, requests) for i := 0; i < requests; i++ { go func() { start := time.Now() - err := runHTTPRequest(port) + err := runHTTPRequest(ip, port) log.Printf("Response time %v: %v", time.Since(start).String(), err) ch <- err }() @@ -110,7 +110,7 @@ func testHTTPServer(t *testing.T, port int) { for i := 0; i < requests; i++ { err := <-ch if err != nil { - t.Errorf("testHTTPServer(%d) failed: %v", port, err) + t.Errorf("testHTTPServer(%s, %d) failed: %v", ip, port, err) } } } @@ -121,27 +121,28 @@ func TestHttpd(t *testing.T) { defer d.CleanUp(ctx) // Start the container. + port := 80 opts := dockerutil.RunOpts{ Image: "basic/httpd", - Ports: []int{80}, + Ports: []int{port}, } d.CopyFiles(&opts, "/usr/local/apache2/htdocs", "test/image/latin10k.txt") if err := d.Spawn(ctx, opts); err != nil { t.Fatalf("docker run failed: %v", err) } - // Find where port 80 is mapped to. - port, err := d.FindPort(ctx, 80) + // Find container IP address. + ip, err := d.FindIP(ctx, false) if err != nil { - t.Fatalf("FindPort(80) failed: %v", err) + t.Fatalf("docker.FindIP failed: %v", err) } // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, defaultWait); err != nil { + if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil { t.Errorf("WaitForHTTP() timeout: %v", err) } - testHTTPServer(t, port) + testHTTPServer(t, ip.String(), port) } func TestNginx(t *testing.T) { @@ -150,27 +151,28 @@ func TestNginx(t *testing.T) { defer d.CleanUp(ctx) // Start the container. + port := 80 opts := dockerutil.RunOpts{ Image: "basic/nginx", - Ports: []int{80}, + Ports: []int{port}, } d.CopyFiles(&opts, "/usr/share/nginx/html", "test/image/latin10k.txt") if err := d.Spawn(ctx, opts); err != nil { t.Fatalf("docker run failed: %v", err) } - // Find where port 80 is mapped to. - port, err := d.FindPort(ctx, 80) + // Find container IP address. + ip, err := d.FindIP(ctx, false) if err != nil { - t.Fatalf("FindPort(80) failed: %v", err) + t.Fatalf("docker.FindIP failed: %v", err) } // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, defaultWait); err != nil { + if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil { t.Errorf("WaitForHTTP() timeout: %v", err) } - testHTTPServer(t, port) + testHTTPServer(t, ip.String(), port) } func TestMysql(t *testing.T) { @@ -218,26 +220,27 @@ func TestTomcat(t *testing.T) { defer d.CleanUp(ctx) // Start the server. + port := 8080 if err := d.Spawn(ctx, dockerutil.RunOpts{ Image: "basic/tomcat", - Ports: []int{8080}, + Ports: []int{port}, }); err != nil { t.Fatalf("docker run failed: %v", err) } - // Find where port 8080 is mapped to. - port, err := d.FindPort(ctx, 8080) + // Find container IP address. + ip, err := d.FindIP(ctx, false) if err != nil { - t.Fatalf("FindPort(8080) failed: %v", err) + t.Fatalf("docker.FindIP failed: %v", err) } // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, defaultWait); err != nil { + if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil { t.Fatalf("WaitForHTTP() timeout: %v", err) } // Ensure that content is being served. - url := fmt.Sprintf("http://localhost:%d", port) + url := fmt.Sprintf("http://%s:%d", ip.String(), port) resp, err := http.Get(url) if err != nil { t.Errorf("Error reaching http server: %v", err) @@ -253,28 +256,29 @@ func TestRuby(t *testing.T) { defer d.CleanUp(ctx) // Execute the ruby workload. + port := 8080 opts := dockerutil.RunOpts{ Image: "basic/ruby", - Ports: []int{8080}, + Ports: []int{port}, } d.CopyFiles(&opts, "/src", "test/image/ruby.rb", "test/image/ruby.sh") if err := d.Spawn(ctx, opts, "/src/ruby.sh"); err != nil { t.Fatalf("docker run failed: %v", err) } - // Find where port 8080 is mapped to. - port, err := d.FindPort(ctx, 8080) + // Find container IP address. + ip, err := d.FindIP(ctx, false) if err != nil { - t.Fatalf("FindPort(8080) failed: %v", err) + t.Fatalf("docker.FindIP failed: %v", err) } // Wait until it's up and running, 'gem install' can take some time. - if err := testutil.WaitForHTTP(port, time.Minute); err != nil { + if err := testutil.WaitForHTTP(ip.String(), port, time.Minute); err != nil { t.Fatalf("WaitForHTTP() timeout: %v", err) } // Ensure that content is being served. - url := fmt.Sprintf("http://localhost:%d", port) + url := fmt.Sprintf("http://%s:%d", ip.String(), port) resp, err := http.Get(url) if err != nil { t.Errorf("error reaching http server: %v", err) diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go index e2beb30d5..398f70ecd 100644 --- a/test/iptables/iptables_test.go +++ b/test/iptables/iptables_test.go @@ -48,6 +48,13 @@ func singleTest(t *testing.T, test TestCase) { } } +// TODO(gvisor.dev/issue/3549): IPv6 NAT support. +func ipv4Test(t *testing.T, test TestCase) { + t.Run("IPv4", func(t *testing.T) { + iptablesTest(t, test, false) + }) +} + func iptablesTest(t *testing.T, test TestCase, ipv6 bool) { if _, ok := Tests[test.Name()]; !ok { t.Fatalf("no test found with name %q. Has it been registered?", test.Name()) @@ -72,11 +79,6 @@ func iptablesTest(t *testing.T, test TestCase, ipv6 bool) { d.CleanUp(context.Background()) }() - // TODO(gvisor.dev/issue/170): Skipping IPv6 gVisor tests. - if ipv6 && dockerutil.Runtime() != "runc" { - t.Skip("gVisor ip6tables not yet implemented") - } - // Create and start the container. opts := dockerutil.RunOpts{ Image: "iptables", @@ -314,75 +316,75 @@ func TestInputInvertDestination(t *testing.T) { singleTest(t, FilterInputInvertDestination{}) } -func TestOutputDestination(t *testing.T) { +func TestFilterOutputDestination(t *testing.T) { singleTest(t, FilterOutputDestination{}) } -func TestOutputInvertDestination(t *testing.T) { +func TestFilterOutputInvertDestination(t *testing.T) { singleTest(t, FilterOutputInvertDestination{}) } func TestNATPreRedirectUDPPort(t *testing.T) { - singleTest(t, NATPreRedirectUDPPort{}) + ipv4Test(t, NATPreRedirectUDPPort{}) } func TestNATPreRedirectTCPPort(t *testing.T) { - singleTest(t, NATPreRedirectTCPPort{}) + ipv4Test(t, NATPreRedirectTCPPort{}) } func TestNATPreRedirectTCPOutgoing(t *testing.T) { - singleTest(t, NATPreRedirectTCPOutgoing{}) + ipv4Test(t, NATPreRedirectTCPOutgoing{}) } func TestNATOutRedirectTCPIncoming(t *testing.T) { - singleTest(t, NATOutRedirectTCPIncoming{}) + ipv4Test(t, NATOutRedirectTCPIncoming{}) } func TestNATOutRedirectUDPPort(t *testing.T) { - singleTest(t, NATOutRedirectUDPPort{}) + ipv4Test(t, NATOutRedirectUDPPort{}) } func TestNATOutRedirectTCPPort(t *testing.T) { - singleTest(t, NATOutRedirectTCPPort{}) + ipv4Test(t, NATOutRedirectTCPPort{}) } func TestNATDropUDP(t *testing.T) { - singleTest(t, NATDropUDP{}) + ipv4Test(t, NATDropUDP{}) } func TestNATAcceptAll(t *testing.T) { - singleTest(t, NATAcceptAll{}) + ipv4Test(t, NATAcceptAll{}) } func TestNATOutRedirectIP(t *testing.T) { - singleTest(t, NATOutRedirectIP{}) + ipv4Test(t, NATOutRedirectIP{}) } func TestNATOutDontRedirectIP(t *testing.T) { - singleTest(t, NATOutDontRedirectIP{}) + ipv4Test(t, NATOutDontRedirectIP{}) } func TestNATOutRedirectInvert(t *testing.T) { - singleTest(t, NATOutRedirectInvert{}) + ipv4Test(t, NATOutRedirectInvert{}) } func TestNATPreRedirectIP(t *testing.T) { - singleTest(t, NATPreRedirectIP{}) + ipv4Test(t, NATPreRedirectIP{}) } func TestNATPreDontRedirectIP(t *testing.T) { - singleTest(t, NATPreDontRedirectIP{}) + ipv4Test(t, NATPreDontRedirectIP{}) } func TestNATPreRedirectInvert(t *testing.T) { - singleTest(t, NATPreRedirectInvert{}) + ipv4Test(t, NATPreRedirectInvert{}) } func TestNATRedirectRequiresProtocol(t *testing.T) { - singleTest(t, NATRedirectRequiresProtocol{}) + ipv4Test(t, NATRedirectRequiresProtocol{}) } func TestNATLoopbackSkipsPrerouting(t *testing.T) { - singleTest(t, NATLoopbackSkipsPrerouting{}) + ipv4Test(t, NATLoopbackSkipsPrerouting{}) } func TestInputSource(t *testing.T) { @@ -419,9 +421,9 @@ func TestFilterAddrs(t *testing.T) { } func TestNATPreOriginalDst(t *testing.T) { - singleTest(t, NATPreOriginalDst{}) + ipv4Test(t, NATPreOriginalDst{}) } func TestNATOutOriginalDst(t *testing.T) { - singleTest(t, NATOutOriginalDst{}) + ipv4Test(t, NATOutOriginalDst{}) } diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD index f850dfcd8..fbfea61e1 100644 --- a/test/packetimpact/tests/BUILD +++ b/test/packetimpact/tests/BUILD @@ -272,6 +272,16 @@ packetimpact_go_test( ) packetimpact_go_test( + name = "tcp_queue_send_in_syn_sent", + srcs = ["tcp_queue_send_in_syn_sent_test.go"], + deps = [ + "//pkg/tcpip/header", + "//test/packetimpact/testbench", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +packetimpact_go_test( name = "icmpv6_param_problem", srcs = ["icmpv6_param_problem_test.go"], # TODO(b/153485026): Fix netstack then remove the line below. diff --git a/test/packetimpact/tests/tcp_queue_send_in_syn_sent_test.go b/test/packetimpact/tests/tcp_queue_send_in_syn_sent_test.go new file mode 100644 index 000000000..0ec8fd748 --- /dev/null +++ b/test/packetimpact/tests/tcp_queue_send_in_syn_sent_test.go @@ -0,0 +1,133 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp_queue_send_in_syn_sent_test + +import ( + "context" + "errors" + "flag" + "net" + "sync" + "syscall" + "testing" + "time" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/test/packetimpact/testbench" +) + +func init() { + testbench.RegisterFlags(flag.CommandLine) +} + +// TestQueueSendInSynSent tests send behavior when the TCP state +// is SYN-SENT. +// It tests for 2 variants when in SYN_SENT state and: +// (1) DUT blocks on send and complete handshake +// (2) DUT blocks on send and receive a TCP RST. +func TestQueueSendInSynSent(t *testing.T) { + for _, tt := range []struct { + description string + reset bool + }{ + {description: "Complete handshake", reset: false}, + {description: "Send RST", reset: true}, + } { + t.Run(tt.description, func(t *testing.T) { + dut := testbench.NewDUT(t) + defer dut.TearDown() + + socket, remotePort := dut.CreateBoundSocket(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, net.ParseIP(testbench.RemoteIPv4)) + conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + defer conn.Close(t) + + sampleData := []byte("Sample Data") + samplePayload := &testbench.Payload{Bytes: sampleData} + dut.SetNonBlocking(t, socket, true) + if _, err := dut.ConnectWithErrno(context.Background(), t, socket, conn.LocalAddr(t)); !errors.Is(err, syscall.EINPROGRESS) { + t.Fatalf("failed to bring DUT to SYN-SENT, got: %s, want EINPROGRESS", err) + } + if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn)}, time.Second); err != nil { + t.Fatalf("expected a SYN from DUT, but got none: %s", err) + } + if _, err := dut.SendWithErrno(context.Background(), t, socket, sampleData, 0); err != syscall.Errno(unix.EWOULDBLOCK) { + t.Fatalf("expected error %s, got %s", syscall.Errno(unix.EWOULDBLOCK), err) + } + + // Test blocking write. + dut.SetNonBlocking(t, socket, false) + + var wg sync.WaitGroup + defer wg.Wait() + wg.Add(1) + var block sync.WaitGroup + block.Add(1) + go func() { + defer wg.Done() + ctx, cancel := context.WithTimeout(context.Background(), time.Second*3) + defer cancel() + + block.Done() + // Issue SEND call in SYN-SENT, this should be queued for + // process until the connection is established. + n, err := dut.SendWithErrno(ctx, t, socket, sampleData, 0) + if tt.reset { + if err != syscall.Errno(unix.ECONNREFUSED) { + t.Errorf("expected error %s, got %s", syscall.Errno(unix.ECONNREFUSED), err) + } + if n != -1 { + t.Errorf("expected return value %d, got %d", -1, n) + } + return + } + if n != int32(len(sampleData)) { + t.Errorf("failed to send on DUT: %s", err) + } + }() + + // Wait for the goroutine to be scheduled and before it + // blocks on endpoint send. + block.Wait() + // The following sleep is used to prevent the connection + // from being established before we are blocked on send. + time.Sleep(100 * time.Millisecond) + + if tt.reset { + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst | header.TCPFlagAck)}) + return + } + + // Bring the connection to Established. + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn | header.TCPFlagAck)}) + + // Expect the data from the DUT's enqueued send request. + // + // On Linux, this can be piggybacked with the ACK completing the + // handshake. On gVisor, getting such a piggyback is a bit more + // complicated because the actual data enqueuing occurs in the + // callers of endpoint Write. + if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.Uint8(header.TCPFlagPsh | header.TCPFlagAck)}, samplePayload, time.Second); err != nil { + t.Fatalf("expected payload was not received: %s", err) + } + + // Send sample payload and expect an ACK to ensure connection is still ESTABLISHED. + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagPsh | header.TCPFlagAck)}, &testbench.Payload{Bytes: sampleData}) + if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second); err != nil { + t.Fatalf("expected an ACK from DUT, but got none: %s", err) + } + }) + } +} diff --git a/test/runtimes/exclude/php7.3.6.csv b/test/runtimes/exclude/php7.3.6.csv index 815a137c5..a73f3bcfb 100644 --- a/test/runtimes/exclude/php7.3.6.csv +++ b/test/runtimes/exclude/php7.3.6.csv @@ -32,6 +32,7 @@ ext/standard/tests/file/symlink_link_linkinfo_is_link_variation4.phpt,b/16289534 ext/standard/tests/file/symlink_link_linkinfo_is_link_variation8.phpt,b/162896223, ext/standard/tests/general_functions/escapeshellarg_bug71270.phpt,, ext/standard/tests/general_functions/escapeshellcmd_bug71270.phpt,, +ext/standard/tests/streams/proc_open_bug60120.phpt,,Flaky until php-src 3852a35fdbcb ext/standard/tests/streams/proc_open_bug69900.phpt,,Flaky ext/standard/tests/streams/stream_socket_sendto.phpt,, ext/standard/tests/strings/007.phpt,, diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index e5d43cf2e..451feb8f5 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1673,6 +1673,7 @@ cc_binary( gtest, "//test/util:memory_util", "//test/util:posix_error", + "//test/util:proc_util", "//test/util:temp_path", "//test/util:test_util", "//test/util:thread_util", diff --git a/test/syscalls/linux/ip6tables.cc b/test/syscalls/linux/ip6tables.cc index 78e1fa09d..97297ee2b 100644 --- a/test/syscalls/linux/ip6tables.cc +++ b/test/syscalls/linux/ip6tables.cc @@ -87,8 +87,6 @@ TEST(IP6TablesBasic, GetEntriesErrorPrecedence) { // empty when running in native, but we can test that gVisor has the same // initial state that a newly-booted Linux machine would have. TEST(IP6TablesTest, InitialInfo) { - // TODO(gvisor.dev/issue/3549): Enable for ip6tables. - SKIP_IF(true); SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); FileDescriptor sock = @@ -132,8 +130,6 @@ TEST(IP6TablesTest, InitialInfo) { // are empty when running in native, but we can test that gVisor has the same // initial state that a newly-booted Linux machine would have. TEST(IP6TablesTest, InitialEntries) { - // TODO(gvisor.dev/issue/3549): Enable for ip6tables. - SKIP_IF(true); SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); FileDescriptor sock = diff --git a/test/syscalls/linux/packet_socket_raw.cc b/test/syscalls/linux/packet_socket_raw.cc index f3c1d6bc9..b558e3a01 100644 --- a/test/syscalls/linux/packet_socket_raw.cc +++ b/test/syscalls/linux/packet_socket_raw.cc @@ -643,6 +643,27 @@ TEST_P(RawPacketTest, GetSocketDetachFilter) { SyscallFailsWithErrno(ENOPROTOOPT)); } +TEST_P(RawPacketTest, SetAndGetSocketLinger) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + int level = SOL_SOCKET; + int type = SO_LINGER; + + struct linger sl; + sl.l_onoff = 1; + sl.l_linger = 5; + ASSERT_THAT(setsockopt(s_, level, type, &sl, sizeof(sl)), + SyscallSucceedsWithValue(0)); + + struct linger got_linger = {}; + socklen_t length = sizeof(sl); + ASSERT_THAT(getsockopt(s_, level, type, &got_linger, &length), + SyscallSucceedsWithValue(0)); + + ASSERT_EQ(length, sizeof(got_linger)); + EXPECT_EQ(0, memcmp(&sl, &got_linger, length)); +} + INSTANTIATE_TEST_SUITE_P(AllInetTests, RawPacketTest, ::testing::Values(ETH_P_IP, ETH_P_ALL)); diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index 1ba7bf4f5..c1488b06b 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -63,6 +63,7 @@ #include "test/util/fs_util.h" #include "test/util/memory_util.h" #include "test/util/posix_error.h" +#include "test/util/proc_util.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" @@ -672,6 +673,23 @@ TEST(ProcSelfMaps, Mprotect) { 3 * kPageSize, PROT_READ))); } +TEST(ProcSelfMaps, SharedAnon) { + const Mapping m = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ, MAP_SHARED | MAP_ANONYMOUS)); + + const auto proc_self_maps = + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/maps")); + for (const auto& line : absl::StrSplit(proc_self_maps, '\n')) { + const auto entry = ASSERT_NO_ERRNO_AND_VALUE(ParseProcMapsLine(line)); + if (entry.start <= m.addr() && m.addr() < entry.end) { + // cf. proc(5), "/proc/[pid]/map_files/" + EXPECT_EQ(entry.filename, "/dev/zero (deleted)"); + return; + } + } + FAIL() << "no maps entry containing mapping at " << m.ptr(); +} + TEST(ProcSelfFd, OpenFd) { int pipe_fds[2]; ASSERT_THAT(pipe2(pipe_fds, O_CLOEXEC), SyscallSucceeds()); diff --git a/test/syscalls/linux/raw_socket_icmp.cc b/test/syscalls/linux/raw_socket_icmp.cc index 3de898df7..1b9dbc584 100644 --- a/test/syscalls/linux/raw_socket_icmp.cc +++ b/test/syscalls/linux/raw_socket_icmp.cc @@ -416,6 +416,28 @@ TEST_F(RawSocketICMPTest, BindConnectSendAndReceive) { ASSERT_NO_FATAL_FAILURE(ExpectICMPSuccess(icmp)); } +// Set and get SO_LINGER. +TEST_F(RawSocketICMPTest, SetAndGetSocketLinger) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + int level = SOL_SOCKET; + int type = SO_LINGER; + + struct linger sl; + sl.l_onoff = 1; + sl.l_linger = 5; + ASSERT_THAT(setsockopt(s_, level, type, &sl, sizeof(sl)), + SyscallSucceedsWithValue(0)); + + struct linger got_linger = {}; + socklen_t length = sizeof(sl); + ASSERT_THAT(getsockopt(s_, level, type, &got_linger, &length), + SyscallSucceedsWithValue(0)); + + ASSERT_EQ(length, sizeof(got_linger)); + EXPECT_EQ(0, memcmp(&sl, &got_linger, length)); +} + void RawSocketICMPTest::ExpectICMPSuccess(const struct icmphdr& icmp) { // We're going to receive both the echo request and reply, but the order is // indeterminate. diff --git a/test/syscalls/linux/rename.cc b/test/syscalls/linux/rename.cc index 833c0dc4f..5458f54ad 100644 --- a/test/syscalls/linux/rename.cc +++ b/test/syscalls/linux/rename.cc @@ -170,6 +170,9 @@ TEST(RenameTest, FileOverwritesFile) { } TEST(RenameTest, DirectoryOverwritesDirectoryLinkCount) { + // Directory link counts are synthetic on overlay filesystems. + SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(IsOverlayfs(GetAbsoluteTestTmpdir()))); + auto parent1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); EXPECT_THAT(Links(parent1.path()), IsPosixErrorOkAndHolds(2)); diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc index 6e4ecd680..3f2c0fdf2 100644 --- a/test/syscalls/linux/socket_ip_udp_generic.cc +++ b/test/syscalls/linux/socket_ip_udp_generic.cc @@ -451,7 +451,7 @@ TEST_P(UDPSocketPairTest, TClassRecvMismatch) { } // Test the SO_LINGER option can be set/get on udp socket. -TEST_P(UDPSocketPairTest, SoLingerFail) { +TEST_P(UDPSocketPairTest, SetAndGetSocketLinger) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); int level = SOL_SOCKET; int type = SO_LINGER; @@ -469,15 +469,7 @@ TEST_P(UDPSocketPairTest, SoLingerFail) { SyscallSucceedsWithValue(0)); ASSERT_EQ(length, sizeof(got_linger)); - // Linux returns the values which are set in the SetSockOpt for SO_LINGER. - // In gVisor, we do not store the linger values for UDP as SO_LINGER for UDP - // is a no-op. - if (IsRunningOnGvisor()) { - struct linger want_linger = {}; - EXPECT_EQ(0, memcmp(&want_linger, &got_linger, length)); - } else { - EXPECT_EQ(0, memcmp(&sl, &got_linger, length)); - } + EXPECT_EQ(0, memcmp(&sl, &got_linger, length)); } } // namespace testing diff --git a/test/syscalls/linux/socket_unix_stream.cc b/test/syscalls/linux/socket_unix_stream.cc index 99e77b89e..1edcb15a7 100644 --- a/test/syscalls/linux/socket_unix_stream.cc +++ b/test/syscalls/linux/socket_unix_stream.cc @@ -103,6 +103,24 @@ TEST_P(StreamUnixSocketPairTest, Sendto) { SyscallFailsWithErrno(EISCONN)); } +TEST_P(StreamUnixSocketPairTest, SetAndGetSocketLinger) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct linger sl = {1, 5}; + EXPECT_THAT( + setsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)), + SyscallSucceedsWithValue(0)); + + struct linger got_linger = {}; + socklen_t length = sizeof(sl); + EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER, + &got_linger, &length), + SyscallSucceedsWithValue(0)); + + ASSERT_EQ(length, sizeof(got_linger)); + EXPECT_EQ(0, memcmp(&got_linger, &sl, length)); +} + INSTANTIATE_TEST_SUITE_P( AllUnixDomainSockets, StreamUnixSocketPairTest, ::testing::ValuesIn(IncludeReversals(VecCat<SocketPairKind>( diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc index 1b2941c37..92260b1e1 100644 --- a/test/syscalls/linux/stat.cc +++ b/test/syscalls/linux/stat.cc @@ -333,20 +333,23 @@ TEST_F(StatTest, LeadingDoubleSlash) { // Test that a rename doesn't change the underlying file. TEST_F(StatTest, StatDoesntChangeAfterRename) { - const TempPath old_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath old_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); const TempPath new_path(NewTempAbsPath()); struct stat st_old = {}; struct stat st_new = {}; - ASSERT_THAT(stat(old_dir.path().c_str(), &st_old), SyscallSucceeds()); - ASSERT_THAT(rename(old_dir.path().c_str(), new_path.path().c_str()), + ASSERT_THAT(stat(old_file.path().c_str(), &st_old), SyscallSucceeds()); + ASSERT_THAT(rename(old_file.path().c_str(), new_path.path().c_str()), SyscallSucceeds()); ASSERT_THAT(stat(new_path.path().c_str(), &st_new), SyscallSucceeds()); EXPECT_EQ(st_old.st_nlink, st_new.st_nlink); EXPECT_EQ(st_old.st_dev, st_new.st_dev); - EXPECT_EQ(st_old.st_ino, st_new.st_ino); + // Overlay filesystems may synthesize directory inode numbers on the fly. + if (!ASSERT_NO_ERRNO_AND_VALUE(IsOverlayfs(GetAbsoluteTestTmpdir()))) { + EXPECT_EQ(st_old.st_ino, st_new.st_ino); + } EXPECT_EQ(st_old.st_mode, st_new.st_mode); EXPECT_EQ(st_old.st_uid, st_new.st_uid); EXPECT_EQ(st_old.st_gid, st_new.st_gid); @@ -383,7 +386,9 @@ TEST_F(StatTest, LinkCountsWithRegularFileChild) { // This test verifies that inodes remain around when there is an open fd // after link count hits 0. -TEST_F(StatTest, ZeroLinksOpenFdRegularFileChild_NoRandomSave) { +// +// It is marked NoSave because we don't support saving unlinked files. +TEST_F(StatTest, ZeroLinksOpenFdRegularFileChild_NoSave) { // Setting the enviornment variable GVISOR_GOFER_UNCACHED to any value // will prevent this test from running, see the tmpfs lifecycle. // @@ -392,9 +397,6 @@ TEST_F(StatTest, ZeroLinksOpenFdRegularFileChild_NoRandomSave) { const char* uncached_gofer = getenv("GVISOR_GOFER_UNCACHED"); SKIP_IF(uncached_gofer != nullptr); - // We don't support saving unlinked files. - const DisableSave ds; - const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); const TempPath child = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( dir.path(), "hello", TempPath::kDefaultFileMode)); diff --git a/website/_config.yml b/website/_config.yml index b08602970..20fbb3d2d 100644 --- a/website/_config.yml +++ b/website/_config.yml @@ -34,3 +34,6 @@ authors: igudger: name: Ian Gudger email: igudger@google.com + fvoznika: + name: Fabricio Voznika + email: fvoznika@google.com diff --git a/website/assets/images/2020-09-18-containing-a-real-vulnerability-figure1.png b/website/assets/images/2020-09-18-containing-a-real-vulnerability-figure1.png Binary files differnew file mode 100644 index 000000000..c750f0851 --- /dev/null +++ b/website/assets/images/2020-09-18-containing-a-real-vulnerability-figure1.png diff --git a/website/blog/2019-11-18-security-basics.md b/website/blog/2019-11-18-security-basics.md index 2256ee9d5..b6cf57a77 100644 --- a/website/blog/2019-11-18-security-basics.md +++ b/website/blog/2019-11-18-security-basics.md @@ -279,8 +279,10 @@ weaknesses of each gVisor component. We will also use it to introduce Google's Vulnerability Reward Program[^14], and other ways the community can contribute to help make gVisor safe, fast and stable. +<br> +<br> -## Notes +-------------------------------------------------------------------------------- [^1]: [https://en.wikipedia.org/wiki/Secure_by_design](https://en.wikipedia.org/wiki/Secure_by_design) [^2]: [https://gvisor.dev/docs/architecture_guide](https://gvisor.dev/docs/architecture_guide/) diff --git a/website/blog/2020-09-18-containing-a-real-vulnerability.md b/website/blog/2020-09-18-containing-a-real-vulnerability.md new file mode 100644 index 000000000..b71ef63d9 --- /dev/null +++ b/website/blog/2020-09-18-containing-a-real-vulnerability.md @@ -0,0 +1,224 @@ +# Containing a Real Vulnerability + +In the previous two posts we talked about gVisor's +[security design principles](https://gvisor.dev/blog/2019/11/18/gvisor-security-basics-part-1/) +as well as how those are applied in the +[context of networking](https://gvisor.dev/blog/2020/04/02/gvisor-networking-security/). +Recently, a new container escape vulnerability +([CVE-2020-14386](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-14386)) +was announced that ties these topics well together. gVisor is +[not vulnerable](https://seclists.org/oss-sec/2020/q3/168) to this specific +issue, but it provides an interesting case study to continue our exploration of +gVisor's security. While gVisor is not immune to vulnerabilities, +[we take several steps](https://gvisor.dev/security/) to minimize the impact and +remediate if a vulnerability is found. + +## Escaping the Container + +First, let’s describe how the discovered vulnerability works. There are numerous +ways one can send and receive bytes over the network with Linux. One of the most +performant ways is to use a ring buffer, which is a memory region shared by the +application and the kernel. These rings are created by calling +[setsockopt(2)](https://man7.org/linux/man-pages/man2/setsockopt.2.html) with +[`PACKET_RX_RING`](https://man7.org/linux/man-pages/man7/packet.7.html) for +receiving and +[`PACKET_TX_RING`](https://man7.org/linux/man-pages/man7/packet.7.html) for +sending packets. + +The vulnerability is in the code that reads packets when `PACKET_RX_RING` is +enabled. There is another option +([`PACKET_RESERVE`](https://man7.org/linux/man-pages/man7/packet.7.html)) that +asks the kernel to leave some space in the ring buffer before each packet for +anything the application needs, e.g. control structures. When a packet is +received, the kernel calculates where to copy the packet to, taking the amount +reserved before each packet into consideration. If the amount reserved is large, +the kernel performed an incorrect calculation which could cause an overflow +leading to an out-of-bounds write of up to 10 bytes, controlled by the attacker. +The data in the write is easily controlled using the loopback to send a crafted +packet and receiving it using a `PACKET_RX_RING` with a carefully selected +`PACKET_RESERVE` size. + +```c +static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ +// ... + if (sk->sk_type == SOCK_DGRAM) { + macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 + + po->tp_reserve; + } else { + unsigned int maclen = skb_network_offset(skb); + // tp_reserve is unsigned int, netoff is unsigned short. Addition can overflow netoff + netoff = TPACKET_ALIGN(po->tp_hdrlen + + (maclen < 16 ? 16 : maclen)) + + po->tp_reserve; + if (po->has_vnet_hdr) { + netoff += sizeof(struct virtio_net_hdr); + do_vnet = true; + } + // Attacker controls netoff and can make macoff be smaller than sizeof(struct virtio_net_hdr) + macoff = netoff - maclen; + } +// ... + // "macoff - sizeof(struct virtio_net_hdr)" can be negative, resulting in a pointer before h.raw + if (do_vnet && + virtio_net_hdr_from_skb(skb, h.raw + macoff - + sizeof(struct virtio_net_hdr), + vio_le(), true, 0)) { +// ... +``` + +The [`CAP_NET_RAW`](https://man7.org/linux/man-pages/man7/capabilities.7.html) +capability is required to create the socket above. However, in order to support +common debugging tools like `ping` and `tcpdump`, Docker containers, including +those created for Kubernetes, are given `CAP_NET_RAW` by default and thus may be +able to trigger this vulnerability to elevate privileges and escape the +container. + +Next, we are going to explore why this vulnerability doesn’t work in gVisor, and +how gVisor could prevent the escape even if a similar vulnerability existed +inside gVisor’s kernel. + +## Default Protections + +gVisor does not implement `PACKET_RX_RING`, but **does** support raw sockets +which are required for `PACKET_RX_RING`. Raw sockets are a controversial feature +to support in a sandbox environment. While it allows great customizations for +essential tools like `ping`, it may allow packets to be written to the network +without any validation. In general, allowing an untrusted application to write +crafted packets to the network is a questionable idea and a historical source of +vulnerabilities. With that in mind, if `CAP_NET_RAW` is enabled by default, it +would not be _secure by default_ to run untrusted applications. + +After multiple discussions when raw sockets were first implemented, we decided +to disable raw sockets by default, **even if `CAP_NET_RAW` is given to the +application**. Instead, enabling raw sockets in gVisor requires the admin to set +`--net-raw` flag to runsc when configuring the runtime, in addition to requiring +the `CAP_NET_RAW` capability in the application. It comes at the expense that +some tools may not work out of the box, but as part of our +[secure-by-default](https://gvisor.dev/blog/2019/11/18/gvisor-security-basics-part-1/#secure-by-default) +principle, we felt that it was important for the “less secure” configuration to +be explicit. + +Since this bug was due to an overflow in the specific Linux implementation of +the packet ring, gVisor's raw socket implementation is not affected. However, if +there were a vulnerability in gVisor, containers would not be allowed to exploit +it by default. + +As an alternative way to implement this same constraint, Kubernetes allows +[admission controllers](https://kubernetes.io/docs/reference/access-authn-authz/admission-controllers/) +to be configured to customize requests. Cloud providers can use this to +implement more stringent policies. For example, GKE implements an admission +controller for gVisor that +[removes `CAP_NET_RAW` from gVisor pods](https://cloud.google.com/kubernetes-engine/docs/concepts/sandbox-pods#capabilities) +unless it has been explicitly set in the pod spec. + +## Isolated Kernel + +gVisor has its own application kernel, called the Sentry, that is distinct from +the host kernel. Just like what you would expect from a kernel, gVisor has a +memory management subsystem, virtual file system, and a full network stack. The +host network is only used as a transport to carry packets in and out the +sandbox[^1]. The loopback interface which is used in the exploit stays +completely inside the sandbox, never reaching the host. + +Therefore, even if the Sentry was vulnerable to the attack, there would be two +factors that would prevent a container escape from happening. First, the +vulnerability would be limited to the Sentry, and the attacker would compromise +only the application kernel, bound by a restricted set of +[seccomp](https://en.wikipedia.org/wiki/Seccomp) filters, discussed more in +depth below. Second, the Sentry is a distinct implementation of the API, written +in Go, which provides bounds checking that would have likely prevented access +past the bounds of the shared region (e.g. see +[aio](https://cs.opensource.google/gvisor/gvisor/+/master:pkg/sentry/syscalls/linux/vfs2/aio.go;l=210;drc=a11061d78a58ed75b10606d1a770b035ed944b66?q=file:aio&ss=gvisor%2Fgvisor) +or +[kcov](https://cs.opensource.google/gvisor/gvisor/+/master:pkg/sentry/kernel/kcov.go;l=272?q=file:kcov&ss=gvisor%2Fgvisor), +which have similar shared regions). + +Here, Kubernetes warrants slightly more explanation. gVisor makes pods the unit +of isolation and a pod can run multiple containers. In other words, each pod is +a gVisor instance, and each container is a set of processes running inside +gVisor, isolated via Sentry-internal namespaces like regular containers inside a +pod. If there were a vulnerability in gVisor, the privilege escalation would +allow a container inside the pod to break out to other **containers inside the +same pod**, but the container still **cannot break out of the pod**. + +## Defense in Depth + +gVisor follows a +[common security principle used at Google](https://cloud.google.com/security/infrastructure/design/resources/google_infrastructure_whitepaper_fa.pdf) +that the system should have two layers of protection, and those layers should +require different compromises to be broken. We apply this principle by assuming +that the Sentry (first layer of defense) +[will be compromised and should not be trusted](https://gvisor.dev/blog/2019/11/18/gvisor-security-basics-part-1/#defense-in-depth). +In order to protect the host kernel from a compromised Sentry, we wrap it around +many security and isolations features to ensure only the minimal set of +functionality from the host kernel is exposed. + +![Figure 1](/assets/images/2020-09-18-containing-a-real-vulnerability-figure1.png "Protection layers.") + +First, the sandbox runs inside a cgroup that can limit and throttle host +resources being used. Second, the sandbox joins empty namespaces, including user +and mount, to further isolate from the host. Next, it changes the process root +to a read-only directory that contains only `/proc` and nothing else. Then, it +executes with the unprivileged user/group +[`nobody`](https://en.wikipedia.org/wiki/Nobody_\(username\)) with all +capabilities stripped. Last and most importantly, a seccomp filter is added to +tightly restrict what parts of the Linux syscall surface that gVisor is allowed +to access. The allowed host surface is a far smaller set of syscalls than the +Sentry implements for applications to use. Not only restricting the syscall +being called, but also checking that arguments to these syscalls are within the +expected set. Dangerous syscalls like <code>execve(2)</code>, +<code>open(2)</code>, and <code>socket(2)</code> are prohibited, thus an +attacker isn’t able to execute binaries or acquire new resources on the host. + +if there were a vulnerability in gVisor that allowed an attacker to execute code +inside the Sentry, the attacker still has extremely limited privileges on the +host. In fact, a compromised Sentry is much more restricted than a +non-compromised regular container. For CVE-2020-14386 in particular, the attack +would be blocked by more than one security layer: non-privileged user, no +capability, and seccomp filters. + +Although the surface is drastically reduced, there is still a chance that there +is a vulnerability in one of the allowed syscalls. That’s why it’s important to +keep the surface small and carefully consider what syscalls are allowed. You can +find the full set of allowed syscalls +[here](https://cs.opensource.google/gvisor/gvisor/+/master:runsc/boot/filter/). + +Another possible attack vector is resources that are present in the Sentry, like +open file descriptors. The Sentry has file descriptors that an attacker could +potentially use, such as log files, platform files (e.g. `/dev/kvm`), an RPC +endpoint that allows external communication with the Sentry, and a Netstack +endpoint that connects the sandbox to the network. The Netstack endpoint in +particular is a concern because it gives direct access to the network. It’s an +`AF_PACKET` socket that allows arbitrary L2 packets to be written to the +network. In the normal case, Netstack assembles packets that go out the network, +giving the container control over only the payload. But if the Sentry is +compromised, an attacker can craft packets to the network. In many ways this is +similar to anyone sending random packets over the internet, but still this is a +place where the host kernel surface exposed is larger than we would like it to +be. + +## Conclusion + +Security comes with many tradeoffs that are often hard to make, such as the +decision to disable raw sockets by default. However, these tradeoffs have served +us well, and we've found them to have paid off over time. CVE-2020-14386 offers +great insight into how multiple layers of protection can be effective against +such an attack. + +We cannot guarantee that a container escape will never happen in gVisor, but we +do our best to make it as hard as we possibly can. + +If you have not tried gVisor yet, it’s easier than you think. Just follow the +steps in the +[Quick Start](https://gvisor.dev/docs/user_guide/quick_start/docker/) guide. +<br> +<br> + +-------------------------------------------------------------------------------- + +[^1]: Those packets are eventually handled by the host, as it needs to route + them to local containers or send them out the NIC. The packet will be + handled by many switches, routers, proxies, servers, etc. along the way, + which may be subject to their own vulnerabilities. diff --git a/website/blog/BUILD b/website/blog/BUILD index 01c1f5a6e..865e403da 100644 --- a/website/blog/BUILD +++ b/website/blog/BUILD @@ -28,6 +28,16 @@ doc( permalink = "/blog/2020/04/02/gvisor-networking-security/", ) +doc( + name = "containing_a_real_vulnerability", + src = "2020-09-18-containing-a-real-vulnerability.md", + authors = [ + "fvoznika", + ], + layout = "post", + permalink = "/blog/2020/09/18/containing-a-real-vulnerability/", +) + docs( name = "posts", deps = [ diff --git a/website/index.md b/website/index.md index 84f877d49..c6cd477c2 100644 --- a/website/index.md +++ b/website/index.md @@ -5,7 +5,7 @@ <div class="col-md-6"> <p>gVisor is an <b>application kernel</b> for <b>containers</b> that provides efficient defense-in-depth anywhere.</p> <p style="margin-top: 20px;"> - <a class="btn" href="/docs/user_guide/quick_start/docker/">Quick start <i class="fas fa-arrow-alt-circle-right ml-2"></i></a> + <a class="btn" href="/docs/user_guide/install/">Get started <i class="fas fa-arrow-alt-circle-right ml-2"></i></a> <a class="btn" href="/docs/">Learn More <i class="fas fa-arrow-alt-circle-right ml-2"></i></a> </p> </div> |