summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--.github/workflows/go.yml15
-rw-r--r--Makefile29
-rw-r--r--README.md2
-rw-r--r--g3doc/user_guide/tutorials/docker.md2
-rw-r--r--images/jekyll/Dockerfile3
-rw-r--r--pkg/abi/linux/ip.go10
-rw-r--r--pkg/buffer/safemem.go82
-rw-r--r--pkg/cleanup/BUILD17
-rw-r--r--pkg/cleanup/cleanup.go60
-rw-r--r--pkg/cleanup/cleanup_test.go66
-rw-r--r--pkg/sentry/fs/g3doc/.gitignore1
-rw-r--r--pkg/sentry/fs/g3doc/fuse.md262
-rw-r--r--pkg/sentry/fsimpl/ext/dentry.go12
-rw-r--r--pkg/sentry/fsimpl/gofer/gofer.go60
-rw-r--r--pkg/sentry/fsimpl/kernfs/kernfs.go14
-rw-r--r--pkg/sentry/fsimpl/tmpfs/BUILD1
-rw-r--r--pkg/sentry/fsimpl/tmpfs/directory.go2
-rw-r--r--pkg/sentry/fsimpl/tmpfs/filesystem.go44
-rw-r--r--pkg/sentry/fsimpl/tmpfs/regular_file.go2
-rw-r--r--pkg/sentry/fsimpl/tmpfs/tmpfs.go79
-rw-r--r--pkg/sentry/kernel/fd_table.go8
-rw-r--r--pkg/sentry/kernel/pipe/BUILD2
-rw-r--r--pkg/sentry/kernel/pipe/pipe.go6
-rw-r--r--pkg/sentry/kernel/pipe/pipe_unsafe.go35
-rw-r--r--pkg/sentry/kernel/pipe/vfs.go219
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/BUILD2
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/inotify.go134
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/read_write.go36
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/splice.go286
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/vfs2.go12
-rw-r--r--pkg/sentry/vfs/BUILD15
-rw-r--r--pkg/sentry/vfs/anonfs.go12
-rw-r--r--pkg/sentry/vfs/dentry.go27
-rw-r--r--pkg/sentry/vfs/file_description.go5
-rw-r--r--pkg/sentry/vfs/inotify.go697
-rw-r--r--pkg/sentry/vfs/mount.go56
-rw-r--r--pkg/sentry/vfs/vfs.go5
-rw-r--r--pkg/tcpip/link/channel/channel.go2
-rw-r--r--pkg/tcpip/link/fdbased/endpoint.go2
-rw-r--r--pkg/tcpip/link/fdbased/endpoint_test.go2
-rw-r--r--pkg/tcpip/link/fdbased/mmap.go2
-rw-r--r--pkg/tcpip/link/fdbased/packet_dispatchers.go6
-rw-r--r--pkg/tcpip/link/loopback/loopback.go4
-rw-r--r--pkg/tcpip/link/muxed/injectable.go2
-rw-r--r--pkg/tcpip/link/qdisc/fifo/endpoint.go4
-rw-r--r--pkg/tcpip/link/sharedmem/sharedmem.go2
-rw-r--r--pkg/tcpip/link/sharedmem/sharedmem_test.go2
-rw-r--r--pkg/tcpip/link/sniffer/sniffer.go4
-rw-r--r--pkg/tcpip/link/waitable/waitable.go4
-rw-r--r--pkg/tcpip/link/waitable/waitable_test.go8
-rw-r--r--pkg/tcpip/stack/forwarder_test.go2
-rw-r--r--pkg/tcpip/stack/nic.go4
-rw-r--r--pkg/tcpip/stack/nic_test.go2
-rw-r--r--pkg/tcpip/stack/registration.go2
-rw-r--r--pkg/tcpip/transport/tcp/connect.go1
-rw-r--r--pkg/test/dockerutil/dockerutil.go110
-rw-r--r--runsc/boot/fs.go19
-rw-r--r--runsc/boot/vfs.go6
-rw-r--r--runsc/cgroup/BUILD2
-rw-r--r--runsc/cgroup/cgroup.go4
-rw-r--r--runsc/container/BUILD2
-rw-r--r--runsc/container/container.go7
-rw-r--r--runsc/container/container_test.go22
-rw-r--r--runsc/container/multi_container_test.go28
-rw-r--r--runsc/fsgofer/BUILD2
-rw-r--r--runsc/fsgofer/fsgofer.go8
-rw-r--r--runsc/sandbox/BUILD1
-rw-r--r--runsc/sandbox/sandbox.go3
-rw-r--r--runsc/specutils/specutils.go44
-rwxr-xr-xscripts/build.sh84
-rw-r--r--test/packetimpact/README.md21
-rw-r--r--test/packetimpact/netdevs/BUILD15
-rw-r--r--test/packetimpact/netdevs/netdevs.go104
-rw-r--r--test/packetimpact/runner/BUILD20
-rw-r--r--test/packetimpact/runner/defs.bzl (renamed from test/packetimpact/tests/defs.bzl)13
-rw-r--r--test/packetimpact/runner/packetimpact_test.go332
-rw-r--r--test/packetimpact/testbench/BUILD1
-rw-r--r--test/packetimpact/testbench/connections.go4
-rw-r--r--test/packetimpact/testbench/dut.go10
-rw-r--r--test/packetimpact/testbench/rawsockets.go3
-rw-r--r--test/packetimpact/testbench/testbench.go31
-rw-r--r--test/packetimpact/tests/BUILD40
-rw-r--r--test/packetimpact/tests/ipv4_id_uniqueness_test.go111
-rw-r--r--test/packetimpact/tests/tcp_synrcvd_reset_test.go52
-rw-r--r--test/packetimpact/tests/tcp_synsent_reset_test.go88
-rwxr-xr-xtest/packetimpact/tests/test_runner.sh325
-rw-r--r--test/root/BUILD1
-rw-r--r--test/root/crictl_test.go27
-rw-r--r--test/root/oom_score_adj_test.go30
-rw-r--r--test/runner/runner.go2
-rw-r--r--test/syscalls/linux/BUILD5
-rw-r--r--test/syscalls/linux/inotify.cc340
-rw-r--r--test/syscalls/linux/splice.cc49
-rw-r--r--tools/bazel.mk8
-rwxr-xr-xtools/go_branch.sh6
-rwxr-xr-xtools/make_apt.sh (renamed from tools/make_repository.sh)80
-rwxr-xr-xtools/make_release.sh82
-rw-r--r--website/_config.yml1
98 files changed, 3745 insertions, 773 deletions
diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
index 60704f144..10c86f5cd 100644
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@@ -17,7 +17,14 @@ jobs:
-H "Content-Type: application/json" \
-H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
"${{ github.event.pull_request.statuses_url }}"
+ if: github.event_name == 'pull_request'
- uses: actions/checkout@v2
+ if: github.event_name == 'push'
+ with:
+ fetch-depth: 0
+ token: '${{ secrets.GO_TOKEN }}'
+ - uses: actions/checkout@v2
+ if: github.event_name == 'pull_request'
with:
fetch-depth: 0
- uses: actions/setup-go@v2
@@ -41,20 +48,16 @@ jobs:
- run: go build ./...
- if: github.event_name == 'push'
run: |
- # Required dedicated credentials for the Go branch, due to the way
- # branch protection rules are configured.
- git config --global credential.helper cache
- echo -e "protocol=https\nhost=github.com\nusername=${{ secrets.GO_TOKEN }}\npassword=x-oauth-basic" | git credential approve
git remote add upstream "https://github.com/${{ github.repository }}"
git push upstream go:go
- - if: ${{ success() }}
+ - if: ${{ success() && github.event_name == 'pull_request' }}
run: |
jq -nc '{"state": "success", "context": "go tests"}' | \
curl -sL -X POST -d @- \
-H "Content-Type: application/json" \
-H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
"${{ github.event.pull_request.statuses_url }}"
- - if: ${{ failure() }}
+ - if: ${{ failure() && github.event_name == 'pull_request' }}
run: |
jq -nc '{"state": "failure", "context": "go tests"}' | \
curl -sL -X POST -d @- \
diff --git a/Makefile b/Makefile
index 7f382695d..80cb6851b 100644
--- a/Makefile
+++ b/Makefile
@@ -116,7 +116,7 @@ unit-tests: ## Runs all unit tests in pkg runsc and tools.
.PHONY: unit-tests
tests: ## Runs all local ptrace system call tests.
- @$(MAKE) test OPTIONS="--test_tag_filter runsc_ptrace test/syscalls/..."
+ @$(MAKE) test OPTIONS="--test_tag_filters runsc_ptrace test/syscalls/..."
.PHONY: tests
##
@@ -152,6 +152,33 @@ website-deploy: website-push ## Deploy a new version of the website.
.PHONY: website-push
##
+## Repository builders.
+##
+## This builds a local apt repository. The following variables may be set:
+## RELEASE_ROOT - The repository root (default: "repo" directory).
+## RELEASE_KEY - The repository GPG private key file (default: dummy key is created).
+## RELEASE_NIGHTLY - Set to true if a nightly release (default: false).
+##
+RELEASE_ROOT := $(CURDIR)/repo
+RELEASE_KEY := repo.key
+RELEASE_NIGHTLY := false
+
+$(RELEASE_KEY):
+ @echo "WARNING: Generating a key for testing ($@); don't use this."
+ T=$$(mktemp /tmp/keyring.XXXXXX); \
+ gpg --no-default-keyring --keyring $$T --batch --passphrase "" --quick-generate-key $(shell whoami) && \
+ gpg --export-secret-keys --no-default-keyring --keyring $$T > $@; \
+ rc=$$?; rm -f $$T; exit $$rc
+
+release: $(RELEASE_KEY) ## Builds a release.
+ @mkdir -p $(RELEASE_ROOT)
+ @T=$$(mktemp -d /tmp/release.XXXXXX); \
+ $(MAKE) copy TARGETS="runsc runsc:runsc-debian" DESTINATION=$$T && \
+ NIGHTLY=$(RELEASE_NIGHTLY) tools/make_release.sh $(RELEASE_KEY) $(RELEASE_ROOT) $$T/*; \
+ rc=$$?; rm -rf $$T; exit $$rc
+.PHONY: release
+
+##
## Development helpers and tooling.
##
## These targets faciliate local development by automatically
diff --git a/README.md b/README.md
index 467630aaa..ce3947907 100644
--- a/README.md
+++ b/README.md
@@ -97,7 +97,7 @@ development on this branch is not supported. Development should occur on the
## Community & Governance
-See [GOVERNANCE.md](GOVERANCE.md) for project governance information.
+See [GOVERNANCE.md](GOVERNANCE.md) for project governance information.
The [gvisor-users mailing list][gvisor-users-list] and
[gvisor-dev mailing list][gvisor-dev-list] are good starting points for
diff --git a/g3doc/user_guide/tutorials/docker.md b/g3doc/user_guide/tutorials/docker.md
index c0a3db506..705560038 100644
--- a/g3doc/user_guide/tutorials/docker.md
+++ b/g3doc/user_guide/tutorials/docker.md
@@ -1,4 +1,4 @@
-# WorkPress with Docker
+# WordPress with Docker
This page shows you how to deploy a sample [WordPress][wordpress] site using
[Docker][docker].
diff --git a/images/jekyll/Dockerfile b/images/jekyll/Dockerfile
index cefd949a6..4860dd750 100644
--- a/images/jekyll/Dockerfile
+++ b/images/jekyll/Dockerfile
@@ -8,5 +8,6 @@ RUN gem install \
jekyll-paginate:1.1.0 \
kramdown-parser-gfm:1.1.0 \
jekyll-relative-links:0.6.1 \
- jekyll-feed:0.13.0
+ jekyll-feed:0.13.0 \
+ jekyll-sitemap:1.4.0
CMD ["/usr/gem/gems/jekyll-4.0.0/exe/jekyll", "build", "-t", "-s", "/input", "-d", "/output"]
diff --git a/pkg/abi/linux/ip.go b/pkg/abi/linux/ip.go
index 31e56ffa6..ef6d1093e 100644
--- a/pkg/abi/linux/ip.go
+++ b/pkg/abi/linux/ip.go
@@ -92,6 +92,16 @@ const (
IP_UNICAST_IF = 50
)
+// IP_MTU_DISCOVER values from uapi/linux/in.h
+const (
+ IP_PMTUDISC_DONT = 0
+ IP_PMTUDISC_WANT = 1
+ IP_PMTUDISC_DO = 2
+ IP_PMTUDISC_PROBE = 3
+ IP_PMTUDISC_INTERFACE = 4
+ IP_PMTUDISC_OMIT = 5
+)
+
// Socket options from uapi/linux/in6.h
const (
IPV6_ADDRFORM = 1
diff --git a/pkg/buffer/safemem.go b/pkg/buffer/safemem.go
index 0e5b86344..b789e56e9 100644
--- a/pkg/buffer/safemem.go
+++ b/pkg/buffer/safemem.go
@@ -28,12 +28,11 @@ func (b *buffer) ReadBlock() safemem.Block {
return safemem.BlockFromSafeSlice(b.ReadSlice())
}
-// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
-//
-// This will advance the write index.
-func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
- need := int(srcs.NumBytes())
- if need == 0 {
+// WriteFromSafememReader writes up to count bytes from r to v and advances the
+// write index by the number of bytes written. It calls r.ReadToBlocks() at
+// most once.
+func (v *View) WriteFromSafememReader(r safemem.Reader, count uint64) (uint64, error) {
+ if count == 0 {
return 0, nil
}
@@ -50,32 +49,33 @@ func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
}
// Does the last block have sufficient capacity alone?
- if l := firstBuf.WriteSize(); l >= need {
- dst = safemem.BlockSeqOf(firstBuf.WriteBlock())
+ if l := uint64(firstBuf.WriteSize()); l >= count {
+ dst = safemem.BlockSeqOf(firstBuf.WriteBlock().TakeFirst64(count))
} else {
// Append blocks until sufficient.
- need -= l
+ count -= l
blocks = append(blocks, firstBuf.WriteBlock())
- for need > 0 {
+ for count > 0 {
emptyBuf := bufferPool.Get().(*buffer)
v.data.PushBack(emptyBuf)
- need -= emptyBuf.WriteSize()
- blocks = append(blocks, emptyBuf.WriteBlock())
+ block := emptyBuf.WriteBlock().TakeFirst64(count)
+ count -= uint64(block.Len())
+ blocks = append(blocks, block)
}
dst = safemem.BlockSeqFromSlice(blocks)
}
- // Perform the copy.
- n, err := safemem.CopySeq(dst, srcs)
+ // Perform I/O.
+ n, err := r.ReadToBlocks(dst)
v.size += int64(n)
// Update all indices.
- for left := int(n); left > 0; firstBuf = firstBuf.Next() {
- if l := firstBuf.WriteSize(); left >= l {
+ for left := n; left > 0; firstBuf = firstBuf.Next() {
+ if l := firstBuf.WriteSize(); left >= uint64(l) {
firstBuf.WriteMove(l) // Whole block.
- left -= l
+ left -= uint64(l)
} else {
- firstBuf.WriteMove(left) // Partial block.
+ firstBuf.WriteMove(int(left)) // Partial block.
left = 0
}
}
@@ -83,14 +83,16 @@ func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
return n, err
}
-// ReadToBlocks implements safemem.Reader.ReadToBlocks.
-//
-// This will not advance the read index; the caller should follow
-// this call with a call to TrimFront in order to remove the read
-// data from the buffer. This is done to support pipe sematics.
-func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
- need := int(dsts.NumBytes())
- if need == 0 {
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. It advances the
+// write index by the number of bytes written.
+func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+ return v.WriteFromSafememReader(&safemem.BlockSeqReader{srcs}, srcs.NumBytes())
+}
+
+// ReadToSafememWriter reads up to count bytes from v to w. It does not advance
+// the read index. It calls w.WriteFromBlocks() at most once.
+func (v *View) ReadToSafememWriter(w safemem.Writer, count uint64) (uint64, error) {
+ if count == 0 {
return 0, nil
}
@@ -105,25 +107,27 @@ func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
}
// Is all the data in a single block?
- if l := firstBuf.ReadSize(); l >= need {
- src = safemem.BlockSeqOf(firstBuf.ReadBlock())
+ if l := uint64(firstBuf.ReadSize()); l >= count {
+ src = safemem.BlockSeqOf(firstBuf.ReadBlock().TakeFirst64(count))
} else {
// Build a list of all the buffers.
- need -= l
+ count -= l
blocks = append(blocks, firstBuf.ReadBlock())
- for buf := firstBuf.Next(); buf != nil && need > 0; buf = buf.Next() {
- need -= buf.ReadSize()
- blocks = append(blocks, buf.ReadBlock())
+ for buf := firstBuf.Next(); buf != nil && count > 0; buf = buf.Next() {
+ block := buf.ReadBlock().TakeFirst64(count)
+ count -= uint64(block.Len())
+ blocks = append(blocks, block)
}
src = safemem.BlockSeqFromSlice(blocks)
}
- // Perform the copy.
- n, err := safemem.CopySeq(dsts, src)
-
- // See above: we would normally advance the read index here, but we
- // don't do that in order to support pipe semantics. We rely on a
- // separate call to TrimFront() in this case.
+ // Perform I/O. As documented, we don't advance the read index.
+ return w.WriteFromBlocks(src)
+}
- return n, err
+// ReadToBlocks implements safemem.Reader.ReadToBlocks. It does not advance the
+// read index by the number of bytes read, such that it's only safe to call if
+// the caller guarantees that ReadToBlocks will only be called once.
+func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+ return v.ReadToSafememWriter(&safemem.BlockSeqWriter{dsts}, dsts.NumBytes())
}
diff --git a/pkg/cleanup/BUILD b/pkg/cleanup/BUILD
new file mode 100644
index 000000000..5c34b9872
--- /dev/null
+++ b/pkg/cleanup/BUILD
@@ -0,0 +1,17 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "cleanup",
+ srcs = ["cleanup.go"],
+ visibility = ["//:sandbox"],
+ deps = [
+ ],
+)
+
+go_test(
+ name = "cleanup_test",
+ srcs = ["cleanup_test.go"],
+ library = ":cleanup",
+)
diff --git a/pkg/cleanup/cleanup.go b/pkg/cleanup/cleanup.go
new file mode 100644
index 000000000..14a05f076
--- /dev/null
+++ b/pkg/cleanup/cleanup.go
@@ -0,0 +1,60 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package cleanup provides utilities to clean "stuff" on defers.
+package cleanup
+
+// Cleanup allows defers to be aborted when cleanup needs to happen
+// conditionally. Usage:
+// cu := cleanup.Make(func() { f.Close() })
+// defer cu.Clean() // failure before release is called will close the file.
+// ...
+// cu.Add(func() { f2.Close() }) // Adds another cleanup function
+// ...
+// cu.Release() // on success, aborts closing the file.
+// return f
+type Cleanup struct {
+ cleaners []func()
+}
+
+// Make creates a new Cleanup object.
+func Make(f func()) Cleanup {
+ return Cleanup{cleaners: []func(){f}}
+}
+
+// Add adds a new function to be called on Clean().
+func (c *Cleanup) Add(f func()) {
+ c.cleaners = append(c.cleaners, f)
+}
+
+// Clean calls all cleanup functions in reverse order.
+func (c *Cleanup) Clean() {
+ clean(c.cleaners)
+ c.cleaners = nil
+}
+
+// Release releases the cleanup from its duties, i.e. cleanup functions are not
+// called after this point. Returns a function that calls all registered
+// functions in case the caller has use for them.
+func (c *Cleanup) Release() func() {
+ old := c.cleaners
+ c.cleaners = nil
+ return func() { clean(old) }
+}
+
+func clean(cleaners []func()) {
+ for i := len(cleaners) - 1; i >= 0; i-- {
+ cleaners[i]()
+ }
+}
diff --git a/pkg/cleanup/cleanup_test.go b/pkg/cleanup/cleanup_test.go
new file mode 100644
index 000000000..ab3d9ed95
--- /dev/null
+++ b/pkg/cleanup/cleanup_test.go
@@ -0,0 +1,66 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cleanup
+
+import "testing"
+
+func testCleanupHelper(clean, cleanAdd *bool, release bool) func() {
+ cu := Make(func() {
+ *clean = true
+ })
+ cu.Add(func() {
+ *cleanAdd = true
+ })
+ defer cu.Clean()
+ if release {
+ return cu.Release()
+ }
+ return nil
+}
+
+func TestCleanup(t *testing.T) {
+ clean := false
+ cleanAdd := false
+ testCleanupHelper(&clean, &cleanAdd, false)
+ if !clean {
+ t.Fatalf("cleanup function was not called.")
+ }
+ if !cleanAdd {
+ t.Fatalf("added cleanup function was not called.")
+ }
+}
+
+func TestRelease(t *testing.T) {
+ clean := false
+ cleanAdd := false
+ cleaner := testCleanupHelper(&clean, &cleanAdd, true)
+
+ // Check that clean was not called after release.
+ if clean {
+ t.Fatalf("cleanup function was called.")
+ }
+ if cleanAdd {
+ t.Fatalf("added cleanup function was called.")
+ }
+
+ // Call the cleaner function and check that both cleanup functions are called.
+ cleaner()
+ if !clean {
+ t.Fatalf("cleanup function was not called.")
+ }
+ if !cleanAdd {
+ t.Fatalf("added cleanup function was not called.")
+ }
+}
diff --git a/pkg/sentry/fs/g3doc/.gitignore b/pkg/sentry/fs/g3doc/.gitignore
new file mode 100644
index 000000000..2d19fc766
--- /dev/null
+++ b/pkg/sentry/fs/g3doc/.gitignore
@@ -0,0 +1 @@
+*.html
diff --git a/pkg/sentry/fs/g3doc/fuse.md b/pkg/sentry/fs/g3doc/fuse.md
new file mode 100644
index 000000000..b43c082a7
--- /dev/null
+++ b/pkg/sentry/fs/g3doc/fuse.md
@@ -0,0 +1,262 @@
+# Foreword
+
+This document describes an on-going project to support FUSE filesystems within
+the sentry. This is intended to become the final documentation for this
+subsystem, and is therefore written in the past tense. However FUSE support is
+currently incomplete and the document will be updated as things progress.
+
+# FUSE: Filesystem in Userspace
+
+The sentry supports dispatching filesystem operations to a FUSE server, allowing
+FUSE filesystem to be used with a sandbox.
+
+## Overview
+
+FUSE has two main components:
+
+1. A client kernel driver (canonically `fuse.ko` in Linux), which forwards
+ filesystem operations (usually initiated by syscalls) to the server.
+
+2. A server, which is a userspace daemon that implements the actual filesystem.
+
+The sentry implements the client component, which allows a server daemon running
+within the sandbox to implement a filesystem within the sandbox.
+
+A FUSE filesystem is initialized with `mount(2)`, typically with the help of a
+utility like `fusermount(1)`. Various mount options exist for establishing
+ownership and access permissions on the filesystem, but the most important mount
+option is a file descriptor used to establish communication between the client
+and server.
+
+The FUSE device FD is obtained by opening `/dev/fuse`. During regular operation,
+the client and server use the FUSE protocol described in `fuse(4)` to service
+filesystem operations. See the "Protocol" section below for more information
+about this protocol. The core of the sentry support for FUSE is the client-side
+implementation of this protocol.
+
+## FUSE in the Sentry
+
+The sentry's FUSE client targets VFS2 and has the following components:
+
+- An implementation of `/dev/fuse`.
+
+- A VFS2 filesystem for mapping syscalls to FUSE ops. Since we're targeting
+ VFS2, one point of contention may be the lack of inodes in VFS2. We can
+ tentatively implement a kernfs-based filesystem to bridge the gap in APIs.
+ The kernfs base functionality can serve the role of the Linux inode cache
+ and, the filesystem can map VFS2 syscalls to kernfs inode operations; see
+ the `kernfs.Inode` interface.
+
+The FUSE protocol lends itself well to marshaling with `go_marshal`. The various
+request and response packets can be defined in the ABI package and converted to
+and from the wire format using `go_marshal`.
+
+### Design Goals
+
+- While filesystem performance is always important, the sentry's FUSE support
+ is primarily concerned with compatibility, with performance as a secondary
+ concern.
+
+- Avoiding deadlocks from a hung server daemon.
+
+- Consider the potential for denial of service from a malicious server daemon.
+ Protecting itself from userspace is already a design goal for the sentry,
+ but needs additional consideration for FUSE. Normally, an operating system
+ doesn't rely on userspace to make progress with filesystem operations. Since
+ this changes with FUSE, it opens up the possibility of creating a chain of
+ dependencies controlled by userspace, which could affect an entire sandbox.
+ For example: a FUSE op can block a syscall, which could be holding a
+ subsystem lock, which can then block another task goroutine.
+
+### Milestones
+
+Below are some broad goals to aim for while implementing FUSE in the sentry.
+Many FUSE ops can be grouped into broad categories of functionality, and most
+ops can be implemented in parallel.
+
+#### Minimal client that can mount a trivial FUSE filesystem.
+
+- Implement `/dev/fuse`.
+
+- Implement basic FUSE ops like `FUSE_INIT`, `FUSE_DESTROY`.
+
+#### Read-only mount with basic file operations
+
+- Implement the majority of file, directory and file descriptor FUSE ops. For
+ this milestone, we can skip uncommon or complex operations like mmap, mknod,
+ file locking, poll, and extended attributes. We can stub these out along
+ with any ops that modify the filesystem. The exact list of required ops are
+ to be determined, but the goal is to mount a real filesystem as read-only,
+ and be able to read contents from the filesystem in the sentry.
+
+#### Full read-write support
+
+- Implement the remaining FUSE ops and decide if we can omit rarely used
+ operations like ioctl.
+
+# Appendix
+
+## FUSE Protocol
+
+The FUSE protocol is a request-response protocol. All requests are initiated by
+the client. The wire-format for the protocol is raw C structs serialized to
+memory.
+
+All FUSE requests begin with the following request header:
+
+```c
+struct fuse_in_header {
+ uint32_t len; // Length of the request, including this header.
+ uint32_t opcode; // Requested operation.
+ uint64_t unique; // A unique identifier for this request.
+ uint64_t nodeid; // ID of the filesystem object being operated on.
+ uint32_t uid; // UID of the requesting process.
+ uint32_t gid; // GID of the requesting process.
+ uint32_t pid; // PID of the requesting process.
+ uint32_t padding;
+};
+```
+
+The request is then followed by a payload specific to the `opcode`.
+
+All responses begin with this response header:
+
+```c
+struct fuse_out_header {
+ uint32_t len; // Length of the response, including this header.
+ int32_t error; // Status of the request, 0 if success.
+ uint64_t unique; // The unique identifier from the corresponding request.
+};
+```
+
+The response payload also depends on the request `opcode`. If `error != 0`, the
+response payload must be empty.
+
+### Operations
+
+The following is a list of all FUSE operations used in `fuse_in_header.opcode`
+as of Linux v4.4, and a brief description of their purpose. These are defined in
+`uapi/linux/fuse.h`. Many of these have a corresponding request and response
+payload struct; `fuse(4)` has details for some of these. We also note how these
+operations map to the sentry virtual filesystem.
+
+#### FUSE meta-operations
+
+These operations are specific to FUSE and don't have a corresponding action in a
+generic filesystem.
+
+- `FUSE_INIT`: This operation initializes a new FUSE filesystem, and is the
+ first message sent by the client after mount. This is used for version and
+ feature negotiation. This is related to `mount(2)`.
+- `FUSE_DESTROY`: Teardown a FUSE filesystem, related to `unmount(2)`.
+- `FUSE_INTERRUPT`: Interrupts an in-flight operation, specified by the
+ `fuse_in_header.unique` value provided in the corresponding request header.
+ The client can send at most one of these per request, and will enter an
+ uninterruptible wait for a reply. The server is expected to reply promptly.
+- `FUSE_FORGET`: A hint to the server that server should evict the indicate
+ node from any caches. This is wired up to `(struct
+ super_operations).evict_inode` in Linux, which is in turned hooked as the
+ inode cache shrinker which is typically triggered by system memory pressure.
+- `FUSE_BATCH_FORGET`: Batch version of `FUSE_FORGET`.
+
+#### Filesystem Syscalls
+
+These FUSE ops map directly to an equivalent filesystem syscall, or family of
+syscalls. The relevant syscalls have a similar name to the operation, unless
+otherwise noted.
+
+Node creation:
+
+- `FUSE_MKNOD`
+- `FUSE_MKDIR`
+- `FUSE_CREATE`: This is equivalent to `open(2)` and `creat(2)`, which
+ atomically creates and opens a node.
+
+Node attributes and extended attributes:
+
+- `FUSE_GETATTR`
+- `FUSE_SETATTR`
+- `FUSE_SETXATTR`
+- `FUSE_GETXATTR`
+- `FUSE_LISTXATTR`
+- `FUSE_REMOVEXATTR`
+
+Node link manipulation:
+
+- `FUSE_READLINK`
+- `FUSE_LINK`
+- `FUSE_SYMLINK`
+- `FUSE_UNLINK`
+
+Directory operations:
+
+- `FUSE_RMDIR`
+- `FUSE_RENAME`
+- `FUSE_RENAME2`
+- `FUSE_OPENDIR`: `open(2)` for directories.
+- `FUSE_RELEASEDIR`: `close(2)` for directories.
+- `FUSE_READDIR`
+- `FUSE_READDIRPLUS`
+- `FUSE_FSYNCDIR`: `fsync(2)` for directories.
+- `FUSE_LOOKUP`: Establishes a unique identifier for a FS node. This is
+ reminiscent of `VirtualFilesystem.GetDentryAt` in that it resolves a path
+ component to a node. However the returned identifier is opaque to the
+ client. The server must remember this mapping, as this is how the client
+ will reference the node in the future.
+
+File operations:
+
+- `FUSE_OPEN`: `open(2)` for files.
+- `FUSE_RELEASE`: `close(2)` for files.
+- `FUSE_FSYNC`
+- `FUSE_FALLOCATE`
+- `FUSE_SETUPMAPPING`: Creates a memory map on a file for `mmap(2)`.
+- `FUSE_REMOVEMAPPING`: Removes a memory map for `munmap(2)`.
+
+File locking:
+
+- `FUSE_GETLK`
+- `FUSE_SETLK`
+- `FUSE_SETLKW`
+- `FUSE_COPY_FILE_RANGE`
+
+File descriptor operations:
+
+- `FUSE_IOCTL`
+- `FUSE_POLL`
+- `FUSE_LSEEK`
+
+Filesystem operations:
+
+- `FUSE_STATFS`
+
+#### Permissions
+
+- `FUSE_ACCESS` is used to check if a node is accessible, as part of many
+ syscall implementations. Maps to `vfs.FilesystemImpl.AccessAt` in the
+ sentry.
+
+#### I/O Operations
+
+These ops are used to read and write file pages. They're used to implement both
+I/O syscalls like `read(2)`, `write(2)` and `mmap(2)`.
+
+- `FUSE_READ`
+- `FUSE_WRITE`
+
+#### Miscellaneous
+
+- `FUSE_FLUSH`: Used by the client to indicate when a file descriptor is
+ closed. Distinct from `FUSE_FSYNC`, which corresponds to an `fsync(2)`
+ syscall from the user. Maps to `vfs.FileDescriptorImpl.Release` in the
+ sentry.
+- `FUSE_BMAP`: Old address space API for block defrag. Probably not needed.
+- `FUSE_NOTIFY_REPLY`: [TODO: what does this do?]
+
+# References
+
+- [fuse(4) Linux manual page](https://www.man7.org/linux/man-pages/man4/fuse.4.html)
+- [Linux kernel FUSE documentation](https://www.kernel.org/doc/html/latest/filesystems/fuse.html)
+- [The reference implementation of the Linux FUSE (Filesystem in Userspace)
+ interface](https://github.com/libfuse/libfuse)
+- [The kernel interface of FUSE](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fuse.h)
diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go
index bfbd7c3d4..6bd1a9fc6 100644
--- a/pkg/sentry/fsimpl/ext/dentry.go
+++ b/pkg/sentry/fsimpl/ext/dentry.go
@@ -60,3 +60,15 @@ func (d *dentry) DecRef() {
// inode.decRef().
d.inode.decRef()
}
+
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {}
+
+// Watches implements vfs.DentryImpl.Watches.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *dentry) Watches() *vfs.Watches {
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 6295f6b54..3f3bd56f0 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -84,12 +84,6 @@ type filesystem struct {
// devMinor is the filesystem's minor device number. devMinor is immutable.
devMinor uint32
- // uid and gid are the effective KUID and KGID of the filesystem's creator,
- // and are used as the owner and group for files that don't specify one.
- // uid and gid are immutable.
- uid auth.KUID
- gid auth.KGID
-
// renameMu serves two purposes:
//
// - It synchronizes path resolution with renaming initiated by this
@@ -122,6 +116,8 @@ type filesystemOptions struct {
fd int
aname string
interop InteropMode // derived from the "cache" mount option
+ dfltuid auth.KUID
+ dfltgid auth.KGID
msize uint32
version string
@@ -230,6 +226,15 @@ type InternalFilesystemOptions struct {
OpenSocketsByConnecting bool
}
+// _V9FS_DEFUID and _V9FS_DEFGID (from Linux's fs/9p/v9fs.h) are the default
+// UIDs and GIDs used for files that do not provide a specific owner or group
+// respectively.
+const (
+ // uint32(-2) doesn't work in Go.
+ _V9FS_DEFUID = auth.KUID(4294967294)
+ _V9FS_DEFGID = auth.KGID(4294967294)
+)
+
// Name implements vfs.FilesystemType.Name.
func (FilesystemType) Name() string {
return Name
@@ -315,6 +320,31 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
}
}
+ // Parse the default UID and GID.
+ fsopts.dfltuid = _V9FS_DEFUID
+ if dfltuidstr, ok := mopts["dfltuid"]; ok {
+ delete(mopts, "dfltuid")
+ dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32)
+ if err != nil {
+ ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: dfltuid=%s", dfltuidstr)
+ return nil, nil, syserror.EINVAL
+ }
+ // In Linux, dfltuid is interpreted as a UID and is converted to a KUID
+ // in the caller's user namespace, but goferfs isn't
+ // application-mountable.
+ fsopts.dfltuid = auth.KUID(dfltuid)
+ }
+ fsopts.dfltgid = _V9FS_DEFGID
+ if dfltgidstr, ok := mopts["dfltgid"]; ok {
+ delete(mopts, "dfltgid")
+ dfltgid, err := strconv.ParseUint(dfltgidstr, 10, 32)
+ if err != nil {
+ ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: dfltgid=%s", dfltgidstr)
+ return nil, nil, syserror.EINVAL
+ }
+ fsopts.dfltgid = auth.KGID(dfltgid)
+ }
+
// Parse the 9P message size.
fsopts.msize = 1024 * 1024 // 1M, tested to give good enough performance up to 64M
if msizestr, ok := mopts["msize"]; ok {
@@ -422,8 +452,6 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
client: client,
clock: ktime.RealtimeClockFromContext(ctx),
devMinor: devMinor,
- uid: creds.EffectiveKUID,
- gid: creds.EffectiveKGID,
syncableDentries: make(map[*dentry]struct{}),
specialFileFDs: make(map[*specialFileFD]struct{}),
}
@@ -672,8 +700,8 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
file: file,
ino: qid.Path,
mode: uint32(attr.Mode),
- uid: uint32(fs.uid),
- gid: uint32(fs.gid),
+ uid: uint32(fs.opts.dfltuid),
+ gid: uint32(fs.opts.dfltgid),
blockSize: usermem.PageSize,
handle: handle{
fd: -1,
@@ -1011,6 +1039,18 @@ func (d *dentry) decRefLocked() {
}
}
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {}
+
+// Watches implements vfs.DentryImpl.Watches.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *dentry) Watches() *vfs.Watches {
+ return nil
+}
+
// checkCachingLocked should be called after d's reference count becomes 0 or it
// becomes disowned.
//
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index a83151ad3..bbee8ccda 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -225,9 +225,21 @@ func (d *Dentry) destroy() {
}
}
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *Dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {}
+
+// Watches implements vfs.DentryImpl.Watches.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *Dentry) Watches() *vfs.Watches {
+ return nil
+}
+
// InsertChild inserts child into the vfs dentry cache with the given name under
// this dentry. This does not update the directory inode, so calling this on
-// it's own isn't sufficient to insert a child into a directory. InsertChild
+// its own isn't sufficient to insert a child into a directory. InsertChild
// updates the link count on d if required.
//
// Precondition: d must represent a directory inode.
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index 007be1572..062321cbc 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -59,6 +59,7 @@ go_library(
"//pkg/sentry/pgalloc",
"//pkg/sentry/platform",
"//pkg/sentry/socket/unix/transport",
+ "//pkg/sentry/uniqueid",
"//pkg/sentry/usage",
"//pkg/sentry/vfs",
"//pkg/sentry/vfs/lock",
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index f2399981b..70387cb9c 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -79,6 +79,7 @@ func (dir *directory) removeChildLocked(child *dentry) {
dir.iterMu.Lock()
dir.childList.Remove(child)
dir.iterMu.Unlock()
+ child.unlinked = true
}
type directoryFD struct {
@@ -112,6 +113,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
dir.iterMu.Lock()
defer dir.iterMu.Unlock()
+ fd.dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
fd.inode().touchAtime(fd.vfsfd.Mount())
if fd.off == 0 {
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 80fa7b29d..183eb975c 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -177,6 +177,12 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa
if err := create(parentDir, name); err != nil {
return err
}
+
+ ev := linux.IN_CREATE
+ if dir {
+ ev |= linux.IN_ISDIR
+ }
+ parentDir.inode.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent)
parentDir.inode.touchCMtime()
return nil
}
@@ -241,6 +247,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
return syserror.EMLINK
}
d.inode.incLinksLocked()
+ d.inode.watches.Notify("", linux.IN_ATTRIB, 0, vfs.InodeEvent)
parentDir.insertChildLocked(fs.newDentry(d.inode), name)
return nil
})
@@ -354,6 +361,7 @@ afterTrailingSymlink:
if err != nil {
return nil, err
}
+ parentDir.inode.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent)
parentDir.inode.touchCMtime()
return fd, nil
}
@@ -559,6 +567,8 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
newParentDir.inode.touchCMtime()
}
renamed.inode.touchCtime()
+
+ vfs.InotifyRename(ctx, &renamed.inode.watches, &oldParentDir.inode.watches, &newParentDir.inode.watches, oldName, newName, renamed.inode.isDir())
return nil
}
@@ -603,8 +613,11 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
return err
}
parentDir.removeChildLocked(child)
- parentDir.inode.decLinksLocked() // from child's ".."
+ parentDir.inode.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent)
+ // Remove links for child, child/., and child/..
child.inode.decLinksLocked()
+ child.inode.decLinksLocked()
+ parentDir.inode.decLinksLocked()
vfsObj.CommitDeleteDentry(&child.vfsd)
parentDir.inode.touchCMtime()
return nil
@@ -618,7 +631,14 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
if err != nil {
return err
}
- return d.inode.setStat(ctx, rp.Credentials(), &opts.Stat)
+ if err := d.inode.setStat(ctx, rp.Credentials(), &opts.Stat); err != nil {
+ return err
+ }
+
+ if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+ d.InotifyWithParent(ev, 0, vfs.InodeEvent)
+ }
+ return nil
}
// StatAt implements vfs.FilesystemImpl.StatAt.
@@ -698,6 +718,12 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
return err
}
+
+ // Generate inotify events. Note that this must take place before the link
+ // count of the child is decremented, or else the watches may be dropped
+ // before these events are added.
+ vfs.InotifyRemoveChild(&child.inode.watches, &parentDir.inode.watches, name)
+
parentDir.removeChildLocked(child)
child.inode.decLinksLocked()
vfsObj.CommitDeleteDentry(&child.vfsd)
@@ -754,7 +780,12 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
if err != nil {
return err
}
- return d.inode.setxattr(rp.Credentials(), &opts)
+ if err := d.inode.setxattr(rp.Credentials(), &opts); err != nil {
+ return err
+ }
+
+ d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
}
// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
@@ -765,7 +796,12 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath,
if err != nil {
return err
}
- return d.inode.removexattr(rp.Credentials(), name)
+ if err := d.inode.removexattr(rp.Credentials(), name); err != nil {
+ return err
+ }
+
+ d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
}
// PrependPath implements vfs.FilesystemImpl.PrependPath.
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index 3f433d666..fee174375 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -312,7 +312,7 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
f := fd.inode().impl.(*regularFile)
if end := offset + srclen; end < offset {
// Overflow.
- return 0, syserror.EFBIG
+ return 0, syserror.EINVAL
}
var err error
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 1e781aecd..f0e098702 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -163,6 +163,11 @@ type dentry struct {
// filesystem.mu.
name string
+ // unlinked indicates whether this dentry has been unlinked from its parent.
+ // It is only set to true on an unlink operation, and never set from true to
+ // false. unlinked is protected by filesystem.mu.
+ unlinked bool
+
// dentryEntry (ugh) links dentries into their parent directory.childList.
dentryEntry
@@ -201,6 +206,26 @@ func (d *dentry) DecRef() {
d.inode.decRef()
}
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {
+ if d.inode.isDir() {
+ events |= linux.IN_ISDIR
+ }
+
+ // The ordering below is important, Linux always notifies the parent first.
+ if d.parent != nil {
+ // Note that d.parent or d.name may be stale if there is a concurrent
+ // rename operation. Inotify does not provide consistency guarantees.
+ d.parent.inode.watches.NotifyWithExclusions(d.name, events, cookie, et, d.unlinked)
+ }
+ d.inode.watches.Notify("", events, cookie, et)
+}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *dentry) Watches() *vfs.Watches {
+ return &d.inode.watches
+}
+
// inode represents a filesystem object.
type inode struct {
// fs is the owning filesystem. fs is immutable.
@@ -209,11 +234,9 @@ type inode struct {
// refs is a reference count. refs is accessed using atomic memory
// operations.
//
- // A reference is held on all inodes that are reachable in the filesystem
- // tree. For non-directories (which may have multiple hard links), this
- // means that a reference is dropped when nlink reaches 0. For directories,
- // nlink never reaches 0 due to the "." entry; instead,
- // filesystem.RmdirAt() drops the reference.
+ // A reference is held on all inodes as long as they are reachable in the
+ // filesystem tree, i.e. nlink is nonzero. This reference is dropped when
+ // nlink reaches 0.
refs int64
// xattrs implements extended attributes.
@@ -238,6 +261,9 @@ type inode struct {
// Advisory file locks, which lock at the inode level.
locks lock.FileLocks
+ // Inotify watches for this inode.
+ watches vfs.Watches
+
impl interface{} // immutable
}
@@ -259,6 +285,7 @@ func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials,
i.ctime = now
i.mtime = now
// i.nlink initialized by caller
+ i.watches = vfs.Watches{}
i.impl = impl
}
@@ -276,14 +303,17 @@ func (i *inode) incLinksLocked() {
atomic.AddUint32(&i.nlink, 1)
}
-// decLinksLocked decrements i's link count.
+// decLinksLocked decrements i's link count. If the link count reaches 0, we
+// remove a reference on i as well.
//
// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
func (i *inode) decLinksLocked() {
if i.nlink == 0 {
panic("tmpfs.inode.decLinksLocked() called with no existing links")
}
- atomic.AddUint32(&i.nlink, ^uint32(0))
+ if atomic.AddUint32(&i.nlink, ^uint32(0)) == 0 {
+ i.decRef()
+ }
}
func (i *inode) incRef() {
@@ -306,6 +336,7 @@ func (i *inode) tryIncRef() bool {
func (i *inode) decRef() {
if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
+ i.watches.HandleDeletion()
if regFile, ok := i.impl.(*regularFile); ok {
// Release memory used by regFile to store data. Since regFile is
// no longer usable, we don't need to grab any locks or update any
@@ -627,8 +658,12 @@ func (fd *fileDescription) filesystem() *filesystem {
return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
}
+func (fd *fileDescription) dentry() *dentry {
+ return fd.vfsfd.Dentry().Impl().(*dentry)
+}
+
func (fd *fileDescription) inode() *inode {
- return fd.vfsfd.Dentry().Impl().(*dentry).inode
+ return fd.dentry().inode
}
// Stat implements vfs.FileDescriptionImpl.Stat.
@@ -641,7 +676,15 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
creds := auth.CredentialsFromContext(ctx)
- return fd.inode().setStat(ctx, creds, &opts.Stat)
+ d := fd.dentry()
+ if err := d.inode.setStat(ctx, creds, &opts.Stat); err != nil {
+ return err
+ }
+
+ if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+ d.InotifyWithParent(ev, 0, vfs.InodeEvent)
+ }
+ return nil
}
// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
@@ -656,12 +699,26 @@ func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOption
// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
- return fd.inode().setxattr(auth.CredentialsFromContext(ctx), &opts)
+ d := fd.dentry()
+ if err := d.inode.setxattr(auth.CredentialsFromContext(ctx), &opts); err != nil {
+ return err
+ }
+
+ // Generate inotify events.
+ d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
}
// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
- return fd.inode().removexattr(auth.CredentialsFromContext(ctx), name)
+ d := fd.dentry()
+ if err := d.inode.removexattr(auth.CredentialsFromContext(ctx), name); err != nil {
+ return err
+ }
+
+ // Generate inotify events.
+ d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
}
// NewMemfd creates a new tmpfs regular file and file description that can back
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index ed40b5303..dbfcef0fa 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -152,7 +152,13 @@ func (f *FDTable) drop(file *fs.File) {
// dropVFS2 drops the table reference.
func (f *FDTable) dropVFS2(file *vfs.FileDescription) {
// TODO(gvisor.dev/issue/1480): Release locks.
- // TODO(gvisor.dev/issue/1479): Send inotify events.
+
+ // Generate inotify events.
+ ev := uint32(linux.IN_CLOSE_NOWRITE)
+ if file.IsWritable() {
+ ev = linux.IN_CLOSE_WRITE
+ }
+ file.Dentry().InotifyWithParent(ev, 0, vfs.PathEvent)
// Drop the table reference.
file.DecRef()
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD
index f29dc0472..7bfa9075a 100644
--- a/pkg/sentry/kernel/pipe/BUILD
+++ b/pkg/sentry/kernel/pipe/BUILD
@@ -8,6 +8,7 @@ go_library(
"device.go",
"node.go",
"pipe.go",
+ "pipe_unsafe.go",
"pipe_util.go",
"reader.go",
"reader_writer.go",
@@ -20,6 +21,7 @@ go_library(
"//pkg/amutex",
"//pkg/buffer",
"//pkg/context",
+ "//pkg/safemem",
"//pkg/sentry/arch",
"//pkg/sentry/device",
"//pkg/sentry/fs",
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 62c8691f1..79645d7d2 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -207,7 +207,10 @@ func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) {
p.mu.Lock()
defer p.mu.Unlock()
+ return p.readLocked(ctx, ops)
+}
+func (p *Pipe) readLocked(ctx context.Context, ops readOps) (int64, error) {
// Is the pipe empty?
if p.view.Size() == 0 {
if !p.HasWriters() {
@@ -246,7 +249,10 @@ type writeOps struct {
func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) {
p.mu.Lock()
defer p.mu.Unlock()
+ return p.writeLocked(ctx, ops)
+}
+func (p *Pipe) writeLocked(ctx context.Context, ops writeOps) (int64, error) {
// Can't write to a pipe with no readers.
if !p.HasReaders() {
return 0, syscall.EPIPE
diff --git a/pkg/sentry/kernel/pipe/pipe_unsafe.go b/pkg/sentry/kernel/pipe/pipe_unsafe.go
new file mode 100644
index 000000000..dd60cba24
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/pipe_unsafe.go
@@ -0,0 +1,35 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+import (
+ "unsafe"
+)
+
+// lockTwoPipes locks both x.mu and y.mu in an order that is guaranteed to be
+// consistent for both lockTwoPipes(x, y) and lockTwoPipes(y, x), such that
+// concurrent calls cannot deadlock.
+//
+// Preconditions: x != y.
+func lockTwoPipes(x, y *Pipe) {
+ // Lock the two pipes in order of increasing address.
+ if uintptr(unsafe.Pointer(x)) < uintptr(unsafe.Pointer(y)) {
+ x.mu.Lock()
+ y.mu.Lock()
+ } else {
+ y.mu.Lock()
+ x.mu.Lock()
+ }
+}
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index b54f08a30..2602bed72 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -16,7 +16,9 @@ package pipe
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/buffer"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
@@ -150,7 +152,9 @@ func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) *
return &fd.vfsfd
}
-// VFSPipeFD implements vfs.FileDescriptionImpl for pipes.
+// VFSPipeFD implements vfs.FileDescriptionImpl for pipes. It also implements
+// non-atomic usermem.IO methods, allowing it to be passed as usermem.IO to
+// other FileDescriptions for splice(2) and tee(2).
type VFSPipeFD struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
@@ -229,3 +233,216 @@ func (fd *VFSPipeFD) PipeSize() int64 {
func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) {
return fd.pipe.SetFifoSize(size)
}
+
+// IOSequence returns a useremm.IOSequence that reads up to count bytes from,
+// or writes up to count bytes to, fd.
+func (fd *VFSPipeFD) IOSequence(count int64) usermem.IOSequence {
+ return usermem.IOSequence{
+ IO: fd,
+ Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}),
+ }
+}
+
+// CopyIn implements usermem.IO.CopyIn.
+func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) {
+ origCount := int64(len(dst))
+ n, err := fd.pipe.read(ctx, readOps{
+ left: func() int64 {
+ return int64(len(dst))
+ },
+ limit: func(l int64) {
+ dst = dst[:l]
+ },
+ read: func(view *buffer.View) (int64, error) {
+ n, err := view.ReadAt(dst, 0)
+ view.TrimFront(int64(n))
+ return int64(n), err
+ },
+ })
+ if n > 0 {
+ fd.pipe.Notify(waiter.EventOut)
+ }
+ if err == nil && n != origCount {
+ return int(n), syserror.ErrWouldBlock
+ }
+ return int(n), err
+}
+
+// CopyOut implements usermem.IO.CopyOut.
+func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) {
+ origCount := int64(len(src))
+ n, err := fd.pipe.write(ctx, writeOps{
+ left: func() int64 {
+ return int64(len(src))
+ },
+ limit: func(l int64) {
+ src = src[:l]
+ },
+ write: func(view *buffer.View) (int64, error) {
+ view.Append(src)
+ return int64(len(src)), nil
+ },
+ })
+ if n > 0 {
+ fd.pipe.Notify(waiter.EventIn)
+ }
+ if err == nil && n != origCount {
+ return int(n), syserror.ErrWouldBlock
+ }
+ return int(n), err
+}
+
+// ZeroOut implements usermem.IO.ZeroOut.
+func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) {
+ origCount := toZero
+ n, err := fd.pipe.write(ctx, writeOps{
+ left: func() int64 {
+ return toZero
+ },
+ limit: func(l int64) {
+ toZero = l
+ },
+ write: func(view *buffer.View) (int64, error) {
+ view.Grow(view.Size()+toZero, true /* zero */)
+ return toZero, nil
+ },
+ })
+ if n > 0 {
+ fd.pipe.Notify(waiter.EventIn)
+ }
+ if err == nil && n != origCount {
+ return n, syserror.ErrWouldBlock
+ }
+ return n, err
+}
+
+// CopyInTo implements usermem.IO.CopyInTo.
+func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) {
+ count := ars.NumBytes()
+ if count == 0 {
+ return 0, nil
+ }
+ origCount := count
+ n, err := fd.pipe.read(ctx, readOps{
+ left: func() int64 {
+ return count
+ },
+ limit: func(l int64) {
+ count = l
+ },
+ read: func(view *buffer.View) (int64, error) {
+ n, err := view.ReadToSafememWriter(dst, uint64(count))
+ view.TrimFront(int64(n))
+ return int64(n), err
+ },
+ })
+ if n > 0 {
+ fd.pipe.Notify(waiter.EventOut)
+ }
+ if err == nil && n != origCount {
+ return n, syserror.ErrWouldBlock
+ }
+ return n, err
+}
+
+// CopyOutFrom implements usermem.IO.CopyOutFrom.
+func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) {
+ count := ars.NumBytes()
+ if count == 0 {
+ return 0, nil
+ }
+ origCount := count
+ n, err := fd.pipe.write(ctx, writeOps{
+ left: func() int64 {
+ return count
+ },
+ limit: func(l int64) {
+ count = l
+ },
+ write: func(view *buffer.View) (int64, error) {
+ n, err := view.WriteFromSafememReader(src, uint64(count))
+ return int64(n), err
+ },
+ })
+ if n > 0 {
+ fd.pipe.Notify(waiter.EventIn)
+ }
+ if err == nil && n != origCount {
+ return n, syserror.ErrWouldBlock
+ }
+ return n, err
+}
+
+// SwapUint32 implements usermem.IO.SwapUint32.
+func (fd *VFSPipeFD) SwapUint32(ctx context.Context, addr usermem.Addr, new uint32, opts usermem.IOOpts) (uint32, error) {
+ // How did a pipe get passed as the virtual address space to futex(2)?
+ panic("VFSPipeFD.SwapUint32 called unexpectedly")
+}
+
+// CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32.
+func (fd *VFSPipeFD) CompareAndSwapUint32(ctx context.Context, addr usermem.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) {
+ panic("VFSPipeFD.CompareAndSwapUint32 called unexpectedly")
+}
+
+// LoadUint32 implements usermem.IO.LoadUint32.
+func (fd *VFSPipeFD) LoadUint32(ctx context.Context, addr usermem.Addr, opts usermem.IOOpts) (uint32, error) {
+ panic("VFSPipeFD.LoadUint32 called unexpectedly")
+}
+
+// Splice reads up to count bytes from src and writes them to dst. It returns
+// the number of bytes moved.
+//
+// Preconditions: count > 0.
+func Splice(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) {
+ return spliceOrTee(ctx, dst, src, count, true /* removeFromSrc */)
+}
+
+// Tee reads up to count bytes from src and writes them to dst, without
+// removing the read bytes from src. It returns the number of bytes copied.
+//
+// Preconditions: count > 0.
+func Tee(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) {
+ return spliceOrTee(ctx, dst, src, count, false /* removeFromSrc */)
+}
+
+// Preconditions: count > 0.
+func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFromSrc bool) (int64, error) {
+ if dst.pipe == src.pipe {
+ return 0, syserror.EINVAL
+ }
+
+ lockTwoPipes(dst.pipe, src.pipe)
+ defer dst.pipe.mu.Unlock()
+ defer src.pipe.mu.Unlock()
+
+ n, err := dst.pipe.writeLocked(ctx, writeOps{
+ left: func() int64 {
+ return count
+ },
+ limit: func(l int64) {
+ count = l
+ },
+ write: func(dstView *buffer.View) (int64, error) {
+ return src.pipe.readLocked(ctx, readOps{
+ left: func() int64 {
+ return count
+ },
+ limit: func(l int64) {
+ count = l
+ },
+ read: func(srcView *buffer.View) (int64, error) {
+ n, err := srcView.ReadToSafememWriter(dstView, uint64(count))
+ if n > 0 && removeFromSrc {
+ srcView.TrimFront(int64(n))
+ }
+ return int64(n), err
+ },
+ })
+ },
+ })
+ if n > 0 {
+ dst.pipe.Notify(waiter.EventIn)
+ src.pipe.Notify(waiter.EventOut)
+ }
+ return n, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index f882ef840..9c8b44f64 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -12,6 +12,7 @@ go_library(
"filesystem.go",
"fscontext.go",
"getdents.go",
+ "inotify.go",
"ioctl.go",
"memfd.go",
"mmap.go",
@@ -22,6 +23,7 @@ go_library(
"setstat.go",
"signal.go",
"socket.go",
+ "splice.go",
"stat.go",
"stat_amd64.go",
"stat_arm64.go",
diff --git a/pkg/sentry/syscalls/linux/vfs2/inotify.go b/pkg/sentry/syscalls/linux/vfs2/inotify.go
new file mode 100644
index 000000000..7d50b6a16
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/inotify.go
@@ -0,0 +1,134 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+const allFlags = linux.IN_NONBLOCK | linux.IN_CLOEXEC
+
+// InotifyInit1 implements the inotify_init1() syscalls.
+func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ flags := args[0].Int()
+ if flags&^allFlags != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ ino, err := vfs.NewInotifyFD(t, t.Kernel().VFS(), uint32(flags))
+ if err != nil {
+ return 0, nil, err
+ }
+ defer ino.DecRef()
+
+ fd, err := t.NewFDFromVFS2(0, ino, kernel.FDFlags{
+ CloseOnExec: flags&linux.IN_CLOEXEC != 0,
+ })
+
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return uintptr(fd), nil, nil
+}
+
+// InotifyInit implements the inotify_init() syscalls.
+func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ args[0].Value = 0
+ return InotifyInit1(t, args)
+}
+
+// fdToInotify resolves an fd to an inotify object. If successful, the file will
+// have an extra ref and the caller is responsible for releasing the ref.
+func fdToInotify(t *kernel.Task, fd int32) (*vfs.Inotify, *vfs.FileDescription, error) {
+ f := t.GetFileVFS2(fd)
+ if f == nil {
+ // Invalid fd.
+ return nil, nil, syserror.EBADF
+ }
+
+ ino, ok := f.Impl().(*vfs.Inotify)
+ if !ok {
+ // Not an inotify fd.
+ f.DecRef()
+ return nil, nil, syserror.EINVAL
+ }
+
+ return ino, f, nil
+}
+
+// InotifyAddWatch implements the inotify_add_watch() syscall.
+func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ mask := args[2].Uint()
+
+ // "EINVAL: The given event mask contains no valid events."
+ // -- inotify_add_watch(2)
+ if validBits := mask & linux.ALL_INOTIFY_BITS; validBits == 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // "IN_DONT_FOLLOW: Don't dereference pathname if it is a symbolic link."
+ // -- inotify(7)
+ follow := followFinalSymlink
+ if mask&linux.IN_DONT_FOLLOW == 0 {
+ follow = nofollowFinalSymlink
+ }
+
+ ino, f, err := fdToInotify(t, fd)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer f.DecRef()
+
+ path, err := copyInPath(t, addr)
+ if err != nil {
+ return 0, nil, err
+ }
+ if mask&linux.IN_ONLYDIR != 0 {
+ path.Dir = true
+ }
+ tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, follow)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer tpop.Release()
+ d, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{})
+ if err != nil {
+ return 0, nil, err
+ }
+ defer d.DecRef()
+
+ fd = ino.AddWatch(d.Dentry(), mask)
+ return uintptr(fd), nil, err
+}
+
+// InotifyRmWatch implements the inotify_rm_watch() syscall.
+func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ wd := args[1].Int()
+
+ ino, f, err := fdToInotify(t, fd)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer f.DecRef()
+ return 0, nil, ino.RmWatch(wd)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go
index 3a7ef24f5..7f9debd4a 100644
--- a/pkg/sentry/syscalls/linux/vfs2/read_write.go
+++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go
@@ -93,11 +93,17 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
n, err := file.Read(t, dst, opts)
if err != syserror.ErrWouldBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ }
return n, err
}
allowBlock, deadline, hasDeadline := blockPolicy(t, file)
if !allowBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ }
return n, err
}
@@ -128,6 +134,9 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt
}
file.EventUnregister(&w)
+ if total > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ }
return total, err
}
@@ -248,11 +257,17 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
n, err := file.PRead(t, dst, offset, opts)
if err != syserror.ErrWouldBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ }
return n, err
}
allowBlock, deadline, hasDeadline := blockPolicy(t, file)
if !allowBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ }
return n, err
}
@@ -283,6 +298,9 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of
}
file.EventUnregister(&w)
+ if total > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ }
return total, err
}
@@ -345,11 +363,17 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
n, err := file.Write(t, src, opts)
if err != syserror.ErrWouldBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+ }
return n, err
}
allowBlock, deadline, hasDeadline := blockPolicy(t, file)
if !allowBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+ }
return n, err
}
@@ -380,6 +404,9 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op
}
file.EventUnregister(&w)
+ if total > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+ }
return total, err
}
@@ -500,11 +527,17 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
n, err := file.PWrite(t, src, offset, opts)
if err != syserror.ErrWouldBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+ }
return n, err
}
allowBlock, deadline, hasDeadline := blockPolicy(t, file)
if !allowBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ }
return n, err
}
@@ -535,6 +568,9 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o
}
file.EventUnregister(&w)
+ if total > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ }
return total, err
}
diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go
new file mode 100644
index 000000000..8f3c22a02
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/splice.go
@@ -0,0 +1,286 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// Splice implements Linux syscall splice(2).
+func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ inFD := args[0].Int()
+ inOffsetPtr := args[1].Pointer()
+ outFD := args[2].Int()
+ outOffsetPtr := args[3].Pointer()
+ count := int64(args[4].SizeT())
+ flags := args[5].Int()
+
+ if count == 0 {
+ return 0, nil, nil
+ }
+ if count > int64(kernel.MAX_RW_COUNT) {
+ count = int64(kernel.MAX_RW_COUNT)
+ }
+
+ // Check for invalid flags.
+ if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get file descriptions.
+ inFile := t.GetFileVFS2(inFD)
+ if inFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer inFile.DecRef()
+ outFile := t.GetFileVFS2(outFD)
+ if outFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer outFile.DecRef()
+
+ // Check that both files support the required directionality.
+ if !inFile.IsReadable() || !outFile.IsWritable() {
+ return 0, nil, syserror.EBADF
+ }
+
+ // The operation is non-blocking if anything is non-blocking.
+ //
+ // N.B. This is a rather simplistic heuristic that avoids some
+ // poor edge case behavior since the exact semantics here are
+ // underspecified and vary between versions of Linux itself.
+ nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0)
+
+ // At least one file description must represent a pipe.
+ inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD)
+ outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
+ if !inIsPipe && !outIsPipe {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Copy in offsets.
+ inOffset := int64(-1)
+ if inOffsetPtr != 0 {
+ if inIsPipe {
+ return 0, nil, syserror.ESPIPE
+ }
+ if inFile.Options().DenyPRead {
+ return 0, nil, syserror.EINVAL
+ }
+ if _, err := t.CopyIn(inOffsetPtr, &inOffset); err != nil {
+ return 0, nil, err
+ }
+ if inOffset < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ }
+ outOffset := int64(-1)
+ if outOffsetPtr != 0 {
+ if outIsPipe {
+ return 0, nil, syserror.ESPIPE
+ }
+ if outFile.Options().DenyPWrite {
+ return 0, nil, syserror.EINVAL
+ }
+ if _, err := t.CopyIn(outOffsetPtr, &outOffset); err != nil {
+ return 0, nil, err
+ }
+ if outOffset < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ }
+
+ // Move data.
+ var (
+ n int64
+ err error
+ inCh chan struct{}
+ outCh chan struct{}
+ )
+ for {
+ // If both input and output are pipes, delegate to the pipe
+ // implementation. Otherwise, exactly one end is a pipe, which we
+ // ensure is consistently ordered after the non-pipe FD's locks by
+ // passing the pipe FD as usermem.IO to the non-pipe end.
+ switch {
+ case inIsPipe && outIsPipe:
+ n, err = pipe.Splice(t, outPipeFD, inPipeFD, count)
+ case inIsPipe:
+ if outOffset != -1 {
+ n, err = outFile.PWrite(t, inPipeFD.IOSequence(count), outOffset, vfs.WriteOptions{})
+ outOffset += n
+ } else {
+ n, err = outFile.Write(t, inPipeFD.IOSequence(count), vfs.WriteOptions{})
+ }
+ case outIsPipe:
+ if inOffset != -1 {
+ n, err = inFile.PRead(t, outPipeFD.IOSequence(count), inOffset, vfs.ReadOptions{})
+ inOffset += n
+ } else {
+ n, err = inFile.Read(t, outPipeFD.IOSequence(count), vfs.ReadOptions{})
+ }
+ }
+ if n != 0 || err != syserror.ErrWouldBlock || nonBlock {
+ break
+ }
+
+ // Note that the blocking behavior here is a bit different than the
+ // normal pattern. Because we need to have both data to read and data
+ // to write simultaneously, we actually explicitly block on both of
+ // these cases in turn before returning to the splice operation.
+ if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 {
+ if inCh == nil {
+ inCh = make(chan struct{}, 1)
+ inW, _ := waiter.NewChannelEntry(inCh)
+ inFile.EventRegister(&inW, eventMaskRead)
+ defer inFile.EventUnregister(&inW)
+ continue // Need to refresh readiness.
+ }
+ if err = t.Block(inCh); err != nil {
+ break
+ }
+ }
+ if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 {
+ if outCh == nil {
+ outCh = make(chan struct{}, 1)
+ outW, _ := waiter.NewChannelEntry(outCh)
+ outFile.EventRegister(&outW, eventMaskWrite)
+ defer outFile.EventUnregister(&outW)
+ continue // Need to refresh readiness.
+ }
+ if err = t.Block(outCh); err != nil {
+ break
+ }
+ }
+ }
+
+ // Copy updated offsets out.
+ if inOffsetPtr != 0 {
+ if _, err := t.CopyOut(inOffsetPtr, &inOffset); err != nil {
+ return 0, nil, err
+ }
+ }
+ if outOffsetPtr != 0 {
+ if _, err := t.CopyOut(outOffsetPtr, &outOffset); err != nil {
+ return 0, nil, err
+ }
+ }
+
+ if n == 0 {
+ return 0, nil, err
+ }
+ return uintptr(n), nil, nil
+}
+
+// Tee implements Linux syscall tee(2).
+func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ inFD := args[0].Int()
+ outFD := args[1].Int()
+ count := int64(args[2].SizeT())
+ flags := args[3].Int()
+
+ if count == 0 {
+ return 0, nil, nil
+ }
+ if count > int64(kernel.MAX_RW_COUNT) {
+ count = int64(kernel.MAX_RW_COUNT)
+ }
+
+ // Check for invalid flags.
+ if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get file descriptions.
+ inFile := t.GetFileVFS2(inFD)
+ if inFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer inFile.DecRef()
+ outFile := t.GetFileVFS2(outFD)
+ if outFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer outFile.DecRef()
+
+ // Check that both files support the required directionality.
+ if !inFile.IsReadable() || !outFile.IsWritable() {
+ return 0, nil, syserror.EBADF
+ }
+
+ // The operation is non-blocking if anything is non-blocking.
+ //
+ // N.B. This is a rather simplistic heuristic that avoids some
+ // poor edge case behavior since the exact semantics here are
+ // underspecified and vary between versions of Linux itself.
+ nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0)
+
+ // Both file descriptions must represent pipes.
+ inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD)
+ outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
+ if !inIsPipe || !outIsPipe {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Copy data.
+ var (
+ inCh chan struct{}
+ outCh chan struct{}
+ )
+ for {
+ n, err := pipe.Tee(t, outPipeFD, inPipeFD, count)
+ if n != 0 {
+ return uintptr(n), nil, nil
+ }
+ if err != syserror.ErrWouldBlock || nonBlock {
+ return 0, nil, err
+ }
+
+ // Note that the blocking behavior here is a bit different than the
+ // normal pattern. Because we need to have both data to read and data
+ // to write simultaneously, we actually explicitly block on both of
+ // these cases in turn before returning to the tee operation.
+ if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 {
+ if inCh == nil {
+ inCh = make(chan struct{}, 1)
+ inW, _ := waiter.NewChannelEntry(inCh)
+ inFile.EventRegister(&inW, eventMaskRead)
+ defer inFile.EventUnregister(&inW)
+ continue // Need to refresh readiness.
+ }
+ if err := t.Block(inCh); err != nil {
+ return 0, nil, err
+ }
+ }
+ if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 {
+ if outCh == nil {
+ outCh = make(chan struct{}, 1)
+ outW, _ := waiter.NewChannelEntry(outCh)
+ outFile.EventRegister(&outW, eventMaskWrite)
+ defer outFile.EventUnregister(&outW)
+ continue // Need to refresh readiness.
+ }
+ if err := t.Block(outCh); err != nil {
+ return 0, nil, err
+ }
+ }
+ }
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
index a332d01bd..ef8358b8a 100644
--- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go
+++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
@@ -116,9 +116,9 @@ func Override() {
s.Table[232] = syscalls.Supported("epoll_wait", EpollWait)
s.Table[233] = syscalls.Supported("epoll_ctl", EpollCtl)
s.Table[235] = syscalls.Supported("utimes", Utimes)
- delete(s.Table, 253) // inotify_init
- delete(s.Table, 254) // inotify_add_watch
- delete(s.Table, 255) // inotify_rm_watch
+ s.Table[253] = syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil)
+ s.Table[254] = syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil)
+ s.Table[255] = syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil)
s.Table[257] = syscalls.Supported("openat", Openat)
s.Table[258] = syscalls.Supported("mkdirat", Mkdirat)
s.Table[259] = syscalls.Supported("mknodat", Mknodat)
@@ -134,8 +134,8 @@ func Override() {
s.Table[269] = syscalls.Supported("faccessat", Faccessat)
s.Table[270] = syscalls.Supported("pselect", Pselect)
s.Table[271] = syscalls.Supported("ppoll", Ppoll)
- delete(s.Table, 275) // splice
- delete(s.Table, 276) // tee
+ s.Table[275] = syscalls.Supported("splice", Splice)
+ s.Table[276] = syscalls.Supported("tee", Tee)
s.Table[277] = syscalls.Supported("sync_file_range", SyncFileRange)
s.Table[280] = syscalls.Supported("utimensat", Utimensat)
s.Table[281] = syscalls.Supported("epoll_pwait", EpollPwait)
@@ -151,7 +151,7 @@ func Override() {
s.Table[291] = syscalls.Supported("epoll_create1", EpollCreate1)
s.Table[292] = syscalls.Supported("dup3", Dup3)
s.Table[293] = syscalls.Supported("pipe2", Pipe2)
- delete(s.Table, 294) // inotify_init1
+ s.Table[294] = syscalls.PartiallySupported("inotify_init1", InotifyInit1, "inotify events are only available inside the sandbox.", nil)
s.Table[295] = syscalls.Supported("preadv", Preadv)
s.Table[296] = syscalls.Supported("pwritev", Pwritev)
s.Table[299] = syscalls.Supported("recvmmsg", RecvMMsg)
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 94d69c1cc..774cc66cc 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -15,6 +15,18 @@ go_template_instance(
},
)
+go_template_instance(
+ name = "event_list",
+ out = "event_list.go",
+ package = "vfs",
+ prefix = "event",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*Event",
+ "Linker": "*Event",
+ },
+)
+
go_library(
name = "vfs",
srcs = [
@@ -25,11 +37,13 @@ go_library(
"device.go",
"epoll.go",
"epoll_interest_list.go",
+ "event_list.go",
"file_description.go",
"file_description_impl_util.go",
"filesystem.go",
"filesystem_impl_util.go",
"filesystem_type.go",
+ "inotify.go",
"mount.go",
"mount_unsafe.go",
"options.go",
@@ -57,6 +71,7 @@ go_library(
"//pkg/sentry/limits",
"//pkg/sentry/memmap",
"//pkg/sentry/socket/unix/transport",
+ "//pkg/sentry/uniqueid",
"//pkg/sync",
"//pkg/syserror",
"//pkg/usermem",
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
index caf770fd5..b7c6b60b8 100644
--- a/pkg/sentry/vfs/anonfs.go
+++ b/pkg/sentry/vfs/anonfs.go
@@ -297,3 +297,15 @@ func (d *anonDentry) TryIncRef() bool {
func (d *anonDentry) DecRef() {
// no-op
}
+
+// InotifyWithParent implements DentryImpl.InotifyWithParent.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *anonDentry) InotifyWithParent(events uint32, cookie uint32, et EventType) {}
+
+// Watches implements DentryImpl.Watches.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *anonDentry) Watches() *Watches {
+ return nil
+}
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 8624dbd5d..24af13eb1 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -103,6 +103,22 @@ type DentryImpl interface {
// DecRef decrements the Dentry's reference count.
DecRef()
+
+ // InotifyWithParent notifies all watches on the targets represented by this
+ // dentry and its parent. The parent's watches are notified first, followed
+ // by this dentry's.
+ //
+ // InotifyWithParent automatically adds the IN_ISDIR flag for dentries
+ // representing directories.
+ //
+ // Note that the events may not actually propagate up to the user, depending
+ // on the event masks.
+ InotifyWithParent(events uint32, cookie uint32, et EventType)
+
+ // Watches returns the set of inotify watches for the file corresponding to
+ // the Dentry. Dentries that are hard links to the same underlying file
+ // share the same watches.
+ Watches() *Watches
}
// IncRef increments d's reference count.
@@ -133,6 +149,17 @@ func (d *Dentry) isMounted() bool {
return atomic.LoadUint32(&d.mounts) != 0
}
+// InotifyWithParent notifies all watches on the inodes for this dentry and
+// its parent of events.
+func (d *Dentry) InotifyWithParent(events uint32, cookie uint32, et EventType) {
+ d.impl.InotifyWithParent(events, cookie, et)
+}
+
+// Watches returns the set of inotify watches associated with d.
+func (d *Dentry) Watches() *Watches {
+ return d.impl.Watches()
+}
+
// The following functions are exported so that filesystem implementations can
// use them. The vfs package, and users of VFS, should not call these
// functions.
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index cfabd936c..bb294563d 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -210,6 +210,11 @@ func (fd *FileDescription) VirtualDentry() VirtualDentry {
return fd.vd
}
+// Options returns the options passed to fd.Init().
+func (fd *FileDescription) Options() FileDescriptionOptions {
+ return fd.opts
+}
+
// StatusFlags returns file description status flags, as for fcntl(F_GETFL).
func (fd *FileDescription) StatusFlags() uint32 {
return atomic.LoadUint32(&fd.statusFlags)
diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go
new file mode 100644
index 000000000..05a3051a4
--- /dev/null
+++ b/pkg/sentry/vfs/inotify.go
@@ -0,0 +1,697 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "bytes"
+ "fmt"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/uniqueid"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// inotifyEventBaseSize is the base size of linux's struct inotify_event. This
+// must be a power 2 for rounding below.
+const inotifyEventBaseSize = 16
+
+// EventType defines different kinds of inotfiy events.
+//
+// The way events are labelled appears somewhat arbitrary, but they must match
+// Linux so that IN_EXCL_UNLINK behaves as it does in Linux.
+type EventType uint8
+
+// PathEvent and InodeEvent correspond to FSNOTIFY_EVENT_PATH and
+// FSNOTIFY_EVENT_INODE in Linux.
+const (
+ PathEvent EventType = iota
+ InodeEvent EventType = iota
+)
+
+// Inotify represents an inotify instance created by inotify_init(2) or
+// inotify_init1(2). Inotify implements FileDescriptionImpl.
+//
+// Lock ordering:
+// Inotify.mu -> Watches.mu -> Inotify.evMu
+//
+// +stateify savable
+type Inotify struct {
+ vfsfd FileDescription
+ FileDescriptionDefaultImpl
+ DentryMetadataFileDescriptionImpl
+
+ // Unique identifier for this inotify instance. We don't just reuse the
+ // inotify fd because fds can be duped. These should not be exposed to the
+ // user, since we may aggressively reuse an id on S/R.
+ id uint64
+
+ // queue is used to notify interested parties when the inotify instance
+ // becomes readable or writable.
+ queue waiter.Queue `state:"nosave"`
+
+ // evMu *only* protects the events list. We need a separate lock while
+ // queuing events: using mu may violate lock ordering, since at that point
+ // the calling goroutine may already hold Watches.mu.
+ evMu sync.Mutex `state:"nosave"`
+
+ // A list of pending events for this inotify instance. Protected by evMu.
+ events eventList
+
+ // A scratch buffer, used to serialize inotify events. Allocate this
+ // ahead of time for the sake of performance. Protected by evMu.
+ scratch []byte
+
+ // mu protects the fields below.
+ mu sync.Mutex `state:"nosave"`
+
+ // nextWatchMinusOne is used to allocate watch descriptors on this Inotify
+ // instance. Note that Linux starts numbering watch descriptors from 1.
+ nextWatchMinusOne int32
+
+ // Map from watch descriptors to watch objects.
+ watches map[int32]*Watch
+}
+
+var _ FileDescriptionImpl = (*Inotify)(nil)
+
+// NewInotifyFD constructs a new Inotify instance.
+func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) (*FileDescription, error) {
+ // O_CLOEXEC affects file descriptors, so it must be handled outside of vfs.
+ flags &^= linux.O_CLOEXEC
+ if flags&^linux.O_NONBLOCK != 0 {
+ return nil, syserror.EINVAL
+ }
+
+ id := uniqueid.GlobalFromContext(ctx)
+ vd := vfsObj.NewAnonVirtualDentry(fmt.Sprintf("[inotifyfd:%d]", id))
+ defer vd.DecRef()
+ fd := &Inotify{
+ id: id,
+ scratch: make([]byte, inotifyEventBaseSize),
+ watches: make(map[int32]*Watch),
+ }
+ if err := fd.vfsfd.Init(fd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
+ UseDentryMetadata: true,
+ DenyPRead: true,
+ DenyPWrite: true,
+ }); err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
+
+// Release implements FileDescriptionImpl.Release. Release removes all
+// watches and frees all resources for an inotify instance.
+func (i *Inotify) Release() {
+ // We need to hold i.mu to avoid a race with concurrent calls to
+ // Inotify.handleDeletion from Watches. There's no risk of Watches
+ // accessing this Inotify after the destructor ends, because we remove all
+ // references to it below.
+ i.mu.Lock()
+ defer i.mu.Unlock()
+ for _, w := range i.watches {
+ // Remove references to the watch from the watches set on the target. We
+ // don't need to worry about the references from i.watches, since this
+ // file description is about to be destroyed.
+ w.set.Remove(i.id)
+ }
+}
+
+// EventRegister implements waiter.Waitable.
+func (i *Inotify) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ i.queue.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.
+func (i *Inotify) EventUnregister(e *waiter.Entry) {
+ i.queue.EventUnregister(e)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+//
+// Readiness indicates whether there are pending events for an inotify instance.
+func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask {
+ ready := waiter.EventMask(0)
+
+ i.evMu.Lock()
+ defer i.evMu.Unlock()
+
+ if !i.events.Empty() {
+ ready |= waiter.EventIn
+ }
+
+ return mask & ready
+}
+
+// PRead implements FileDescriptionImpl.
+func (*Inotify) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+ return 0, syserror.ESPIPE
+}
+
+// PWrite implements FileDescriptionImpl.
+func (*Inotify) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+ return 0, syserror.ESPIPE
+}
+
+// Write implements FileDescriptionImpl.Write.
+func (*Inotify) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+ return 0, syserror.EBADF
+}
+
+// Read implements FileDescriptionImpl.Read.
+func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+ if dst.NumBytes() < inotifyEventBaseSize {
+ return 0, syserror.EINVAL
+ }
+
+ i.evMu.Lock()
+ defer i.evMu.Unlock()
+
+ if i.events.Empty() {
+ // Nothing to read yet, tell caller to block.
+ return 0, syserror.ErrWouldBlock
+ }
+
+ var writeLen int64
+ for it := i.events.Front(); it != nil; {
+ // Advance `it` before the element is removed from the list, or else
+ // it.Next() will always be nil.
+ event := it
+ it = it.Next()
+
+ // Does the buffer have enough remaining space to hold the event we're
+ // about to write out?
+ if dst.NumBytes() < int64(event.sizeOf()) {
+ if writeLen > 0 {
+ // Buffer wasn't big enough for all pending events, but we did
+ // write some events out.
+ return writeLen, nil
+ }
+ return 0, syserror.EINVAL
+ }
+
+ // Linux always dequeues an available event as long as there's enough
+ // buffer space to copy it out, even if the copy below fails. Emulate
+ // this behaviour.
+ i.events.Remove(event)
+
+ // Buffer has enough space, copy event to the read buffer.
+ n, err := event.CopyTo(ctx, i.scratch, dst)
+ if err != nil {
+ return 0, err
+ }
+
+ writeLen += n
+ dst = dst.DropFirst64(n)
+ }
+ return writeLen, nil
+}
+
+// Ioctl implements fs.FileOperations.Ioctl.
+func (i *Inotify) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ switch args[1].Int() {
+ case linux.FIONREAD:
+ i.evMu.Lock()
+ defer i.evMu.Unlock()
+ var n uint32
+ for e := i.events.Front(); e != nil; e = e.Next() {
+ n += uint32(e.sizeOf())
+ }
+ var buf [4]byte
+ usermem.ByteOrder.PutUint32(buf[:], n)
+ _, err := uio.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{})
+ return 0, err
+
+ default:
+ return 0, syserror.ENOTTY
+ }
+}
+
+func (i *Inotify) queueEvent(ev *Event) {
+ i.evMu.Lock()
+
+ // Check if we should coalesce the event we're about to queue with the last
+ // one currently in the queue. Events are coalesced if they are identical.
+ if last := i.events.Back(); last != nil {
+ if ev.equals(last) {
+ // "Coalesce" the two events by simply not queuing the new one. We
+ // don't need to raise a waiter.EventIn notification because no new
+ // data is available for reading.
+ i.evMu.Unlock()
+ return
+ }
+ }
+
+ i.events.PushBack(ev)
+
+ // Release mutex before notifying waiters because we don't control what they
+ // can do.
+ i.evMu.Unlock()
+
+ i.queue.Notify(waiter.EventIn)
+}
+
+// newWatchLocked creates and adds a new watch to target.
+//
+// Precondition: i.mu must be locked.
+func (i *Inotify) newWatchLocked(target *Dentry, mask uint32) *Watch {
+ targetWatches := target.Watches()
+ w := &Watch{
+ owner: i,
+ wd: i.nextWatchIDLocked(),
+ set: targetWatches,
+ mask: mask,
+ }
+
+ // Hold the watch in this inotify instance as well as the watch set on the
+ // target.
+ i.watches[w.wd] = w
+ targetWatches.Add(w)
+ return w
+}
+
+// newWatchIDLocked allocates and returns a new watch descriptor.
+//
+// Precondition: i.mu must be locked.
+func (i *Inotify) nextWatchIDLocked() int32 {
+ i.nextWatchMinusOne++
+ return i.nextWatchMinusOne
+}
+
+// handleDeletion handles the deletion of the target of watch w. It removes w
+// from i.watches and a watch removal event is generated.
+func (i *Inotify) handleDeletion(w *Watch) {
+ i.mu.Lock()
+ _, found := i.watches[w.wd]
+ delete(i.watches, w.wd)
+ i.mu.Unlock()
+
+ if found {
+ i.queueEvent(newEvent(w.wd, "", linux.IN_IGNORED, 0))
+ }
+}
+
+// AddWatch constructs a new inotify watch and adds it to the target. It
+// returns the watch descriptor returned by inotify_add_watch(2).
+func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 {
+ // Note: Locking this inotify instance protects the result returned by
+ // Lookup() below. With the lock held, we know for sure the lookup result
+ // won't become stale because it's impossible for *this* instance to
+ // add/remove watches on target.
+ i.mu.Lock()
+ defer i.mu.Unlock()
+
+ // Does the target already have a watch from this inotify instance?
+ if existing := target.Watches().Lookup(i.id); existing != nil {
+ newmask := mask
+ if mask&linux.IN_MASK_ADD != 0 {
+ // "Add (OR) events to watch mask for this pathname if it already
+ // exists (instead of replacing mask)." -- inotify(7)
+ newmask |= atomic.LoadUint32(&existing.mask)
+ }
+ atomic.StoreUint32(&existing.mask, newmask)
+ return existing.wd
+ }
+
+ // No existing watch, create a new watch.
+ w := i.newWatchLocked(target, mask)
+ return w.wd
+}
+
+// RmWatch looks up an inotify watch for the given 'wd' and configures the
+// target to stop sending events to this inotify instance.
+func (i *Inotify) RmWatch(wd int32) error {
+ i.mu.Lock()
+
+ // Find the watch we were asked to removed.
+ w, ok := i.watches[wd]
+ if !ok {
+ i.mu.Unlock()
+ return syserror.EINVAL
+ }
+
+ // Remove the watch from this instance.
+ delete(i.watches, wd)
+
+ // Remove the watch from the watch target.
+ w.set.Remove(w.OwnerID())
+ i.mu.Unlock()
+
+ // Generate the event for the removal.
+ i.queueEvent(newEvent(wd, "", linux.IN_IGNORED, 0))
+
+ return nil
+}
+
+// Watches is the collection of all inotify watches on a single file.
+//
+// +stateify savable
+type Watches struct {
+ // mu protects the fields below.
+ mu sync.RWMutex `state:"nosave"`
+
+ // ws is the map of active watches in this collection, keyed by the inotify
+ // instance id of the owner.
+ ws map[uint64]*Watch
+}
+
+// Lookup returns the watch owned by an inotify instance with the given id.
+// Returns nil if no such watch exists.
+//
+// Precondition: the inotify instance with the given id must be locked to
+// prevent the returned watch from being concurrently modified or replaced in
+// Inotify.watches.
+func (w *Watches) Lookup(id uint64) *Watch {
+ w.mu.Lock()
+ defer w.mu.Unlock()
+ return w.ws[id]
+}
+
+// Add adds watch into this set of watches.
+//
+// Precondition: the inotify instance with the given id must be locked.
+func (w *Watches) Add(watch *Watch) {
+ w.mu.Lock()
+ defer w.mu.Unlock()
+
+ owner := watch.OwnerID()
+ // Sanity check, we should never have two watches for one owner on the
+ // same target.
+ if _, exists := w.ws[owner]; exists {
+ panic(fmt.Sprintf("Watch collision with ID %+v", owner))
+ }
+ if w.ws == nil {
+ w.ws = make(map[uint64]*Watch)
+ }
+ w.ws[owner] = watch
+}
+
+// Remove removes a watch with the given id from this set of watches and
+// releases it. The caller is responsible for generating any watch removal
+// event, as appropriate. The provided id must match an existing watch in this
+// collection.
+//
+// Precondition: the inotify instance with the given id must be locked.
+func (w *Watches) Remove(id uint64) {
+ w.mu.Lock()
+ defer w.mu.Unlock()
+
+ if w.ws == nil {
+ // This watch set is being destroyed. The thread executing the
+ // destructor is already in the process of deleting all our watches. We
+ // got here with no references on the target because we raced with the
+ // destructor notifying all the watch owners of destruction. See the
+ // comment in Watches.HandleDeletion for why this race exists.
+ return
+ }
+
+ if _, ok := w.ws[id]; !ok {
+ // While there's technically no problem with silently ignoring a missing
+ // watch, this is almost certainly a bug.
+ panic(fmt.Sprintf("Attempt to remove a watch, but no watch found with provided id %+v.", id))
+ }
+ delete(w.ws, id)
+}
+
+// Notify queues a new event with all watches in this set.
+func (w *Watches) Notify(name string, events, cookie uint32, et EventType) {
+ w.NotifyWithExclusions(name, events, cookie, et, false)
+}
+
+// NotifyWithExclusions queues a new event with watches in this set. Watches
+// with IN_EXCL_UNLINK are skipped if the event is coming from a child that
+// has been unlinked.
+func (w *Watches) NotifyWithExclusions(name string, events, cookie uint32, et EventType, unlinked bool) {
+ // N.B. We don't defer the unlocks because Notify is in the hot path of
+ // all IO operations, and the defer costs too much for small IO
+ // operations.
+ w.mu.RLock()
+ for _, watch := range w.ws {
+ if unlinked && watch.ExcludeUnlinkedChildren() && et == PathEvent {
+ continue
+ }
+ watch.Notify(name, events, cookie)
+ }
+ w.mu.RUnlock()
+}
+
+// HandleDeletion is called when the watch target is destroyed to emit
+// the appropriate events.
+func (w *Watches) HandleDeletion() {
+ w.Notify("", linux.IN_DELETE_SELF, 0, InodeEvent)
+
+ // TODO(gvisor.dev/issue/1479): This doesn't work because maps are not copied
+ // by value. Ideally, we wouldn't have this circular locking so we can just
+ // notify of IN_DELETE_SELF in the same loop below.
+ //
+ // We can't hold w.mu while calling watch.handleDeletion to preserve lock
+ // ordering w.r.t to the owner inotify instances. Instead, atomically move
+ // the watches map into a local variable so we can iterate over it safely.
+ //
+ // Because of this however, it is possible for the watches' owners to reach
+ // this inode while the inode has no refs. This is still safe because the
+ // owners can only reach the inode until this function finishes calling
+ // watch.handleDeletion below and the inode is guaranteed to exist in the
+ // meantime. But we still have to be very careful not to rely on inode state
+ // that may have been already destroyed.
+ var ws map[uint64]*Watch
+ w.mu.Lock()
+ ws = w.ws
+ w.ws = nil
+ w.mu.Unlock()
+
+ for _, watch := range ws {
+ // TODO(gvisor.dev/issue/1479): consider refactoring this.
+ watch.handleDeletion()
+ }
+}
+
+// Watch represent a particular inotify watch created by inotify_add_watch.
+//
+// +stateify savable
+type Watch struct {
+ // Inotify instance which owns this watch.
+ owner *Inotify
+
+ // Descriptor for this watch. This is unique across an inotify instance.
+ wd int32
+
+ // set is the watch set containing this watch. It belongs to the target file
+ // of this watch.
+ set *Watches
+
+ // Events being monitored via this watch. Must be accessed with atomic
+ // memory operations.
+ mask uint32
+}
+
+// OwnerID returns the id of the inotify instance that owns this watch.
+func (w *Watch) OwnerID() uint64 {
+ return w.owner.id
+}
+
+// ExcludeUnlinkedChildren indicates whether the watched object should continue
+// to be notified of events of its children after they have been unlinked, e.g.
+// for an open file descriptor.
+//
+// TODO(gvisor.dev/issue/1479): Implement IN_EXCL_UNLINK.
+// We can do this by keeping track of the set of unlinked children in Watches
+// to skip notification.
+func (w *Watch) ExcludeUnlinkedChildren() bool {
+ return atomic.LoadUint32(&w.mask)&linux.IN_EXCL_UNLINK != 0
+}
+
+// Notify queues a new event on this watch.
+func (w *Watch) Notify(name string, events uint32, cookie uint32) {
+ mask := atomic.LoadUint32(&w.mask)
+ if mask&events == 0 {
+ // We weren't watching for this event.
+ return
+ }
+
+ // Event mask should include bits matched from the watch plus all control
+ // event bits.
+ unmaskableBits := ^uint32(0) &^ linux.IN_ALL_EVENTS
+ effectiveMask := unmaskableBits | mask
+ matchedEvents := effectiveMask & events
+ w.owner.queueEvent(newEvent(w.wd, name, matchedEvents, cookie))
+}
+
+// handleDeletion handles the deletion of w's target.
+func (w *Watch) handleDeletion() {
+ w.owner.handleDeletion(w)
+}
+
+// Event represents a struct inotify_event from linux.
+//
+// +stateify savable
+type Event struct {
+ eventEntry
+
+ wd int32
+ mask uint32
+ cookie uint32
+
+ // len is computed based on the name field is set automatically by
+ // Event.setName. It should be 0 when no name is set; otherwise it is the
+ // length of the name slice.
+ len uint32
+
+ // The name field has special padding requirements and should only be set by
+ // calling Event.setName.
+ name []byte
+}
+
+func newEvent(wd int32, name string, events, cookie uint32) *Event {
+ e := &Event{
+ wd: wd,
+ mask: events,
+ cookie: cookie,
+ }
+ if name != "" {
+ e.setName(name)
+ }
+ return e
+}
+
+// paddedBytes converts a go string to a null-terminated c-string, padded with
+// null bytes to a total size of 'l'. 'l' must be large enough for all the bytes
+// in the 's' plus at least one null byte.
+func paddedBytes(s string, l uint32) []byte {
+ if l < uint32(len(s)+1) {
+ panic("Converting string to byte array results in truncation, this can lead to buffer-overflow due to the missing null-byte!")
+ }
+ b := make([]byte, l)
+ copy(b, s)
+
+ // b was zero-value initialized during make(), so the rest of the slice is
+ // already filled with null bytes.
+
+ return b
+}
+
+// setName sets the optional name for this event.
+func (e *Event) setName(name string) {
+ // We need to pad the name such that the entire event length ends up a
+ // multiple of inotifyEventBaseSize.
+ unpaddedLen := len(name) + 1
+ // Round up to nearest multiple of inotifyEventBaseSize.
+ e.len = uint32((unpaddedLen + inotifyEventBaseSize - 1) & ^(inotifyEventBaseSize - 1))
+ // Make sure we haven't overflowed and wrapped around when rounding.
+ if unpaddedLen > int(e.len) {
+ panic("Overflow when rounding inotify event size, the 'name' field was too big.")
+ }
+ e.name = paddedBytes(name, e.len)
+}
+
+func (e *Event) sizeOf() int {
+ s := inotifyEventBaseSize + int(e.len)
+ if s < inotifyEventBaseSize {
+ panic("overflow")
+ }
+ return s
+}
+
+// CopyTo serializes this event to dst. buf is used as a scratch buffer to
+// construct the output. We use a buffer allocated ahead of time for
+// performance. buf must be at least inotifyEventBaseSize bytes.
+func (e *Event) CopyTo(ctx context.Context, buf []byte, dst usermem.IOSequence) (int64, error) {
+ usermem.ByteOrder.PutUint32(buf[0:], uint32(e.wd))
+ usermem.ByteOrder.PutUint32(buf[4:], e.mask)
+ usermem.ByteOrder.PutUint32(buf[8:], e.cookie)
+ usermem.ByteOrder.PutUint32(buf[12:], e.len)
+
+ writeLen := 0
+
+ n, err := dst.CopyOut(ctx, buf)
+ if err != nil {
+ return 0, err
+ }
+ writeLen += n
+ dst = dst.DropFirst(n)
+
+ if e.len > 0 {
+ n, err = dst.CopyOut(ctx, e.name)
+ if err != nil {
+ return 0, err
+ }
+ writeLen += n
+ }
+
+ // Santiy check.
+ if writeLen != e.sizeOf() {
+ panic(fmt.Sprintf("Serialized unexpected amount of data for an event, expected %d, wrote %d.", e.sizeOf(), writeLen))
+ }
+
+ return int64(writeLen), nil
+}
+
+func (e *Event) equals(other *Event) bool {
+ return e.wd == other.wd &&
+ e.mask == other.mask &&
+ e.cookie == other.cookie &&
+ e.len == other.len &&
+ bytes.Equal(e.name, other.name)
+}
+
+// InotifyEventFromStatMask generates the appropriate events for an operation
+// that set the stats specified in mask.
+func InotifyEventFromStatMask(mask uint32) uint32 {
+ var ev uint32
+ if mask&(linux.STATX_UID|linux.STATX_GID|linux.STATX_MODE) != 0 {
+ ev |= linux.IN_ATTRIB
+ }
+ if mask&linux.STATX_SIZE != 0 {
+ ev |= linux.IN_MODIFY
+ }
+
+ if (mask & (linux.STATX_ATIME | linux.STATX_MTIME)) == (linux.STATX_ATIME | linux.STATX_MTIME) {
+ // Both times indicates a utime(s) call.
+ ev |= linux.IN_ATTRIB
+ } else if mask&linux.STATX_ATIME != 0 {
+ ev |= linux.IN_ACCESS
+ } else if mask&linux.STATX_MTIME != 0 {
+ mask |= linux.IN_MODIFY
+ }
+ return ev
+}
+
+// InotifyRemoveChild sends the appriopriate notifications to the watch sets of
+// the child being removed and its parent.
+func InotifyRemoveChild(self, parent *Watches, name string) {
+ self.Notify("", linux.IN_ATTRIB, 0, InodeEvent)
+ parent.Notify(name, linux.IN_DELETE, 0, InodeEvent)
+ // TODO(gvisor.dev/issue/1479): implement IN_EXCL_UNLINK.
+}
+
+// InotifyRename sends the appriopriate notifications to the watch sets of the
+// file being renamed and its old/new parents.
+func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches, oldName, newName string, isDir bool) {
+ var dirEv uint32
+ if isDir {
+ dirEv = linux.IN_ISDIR
+ }
+ cookie := uniqueid.InotifyCookie(ctx)
+ oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent)
+ newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent)
+ // Somewhat surprisingly, self move events do not have a cookie.
+ renamed.Notify("", linux.IN_MOVE_SELF, 0, InodeEvent)
+}
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 02850b65c..e4ac6524b 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -28,9 +28,6 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
)
-// lastMountID is used to allocate mount ids. Must be accessed atomically.
-var lastMountID uint64
-
// A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem
// (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem
// (Mount.fs), which applies to path resolution in the context of a particular
@@ -97,7 +94,7 @@ type Mount struct {
func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount {
mnt := &Mount{
- ID: atomic.AddUint64(&lastMountID, 1),
+ ID: atomic.AddUint64(&vfs.lastMountID, 1),
vfs: vfs,
fs: fs,
root: root,
@@ -111,6 +108,16 @@ func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *Mount
return mnt
}
+// Options returns a copy of the MountOptions currently applicable to mnt.
+func (mnt *Mount) Options() MountOptions {
+ mnt.vfs.mountMu.Lock()
+ defer mnt.vfs.mountMu.Unlock()
+ return MountOptions{
+ Flags: mnt.flags,
+ ReadOnly: mnt.readOnly(),
+ }
+}
+
// A MountNamespace is a collection of Mounts.
//
// MountNamespaces are reference-counted. Unless otherwise specified, all
@@ -148,7 +155,7 @@ type MountNamespace struct {
func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) {
rft := vfs.getFilesystemType(fsTypeName)
if rft == nil {
- ctx.Warningf("Unknown filesystem: %s", fsTypeName)
+ ctx.Warningf("Unknown filesystem type: %s", fsTypeName)
return nil, syserror.ENODEV
}
fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
@@ -175,26 +182,34 @@ func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry,
return newMount(vfs, fs, root, nil /* mntns */, opts), nil
}
-// MountAt creates and mounts a Filesystem configured by the given arguments.
-func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
+// MountDisconnected creates a Filesystem configured by the given arguments,
+// then returns a Mount representing it. The new Mount is not associated with
+// any MountNamespace and is not connected to any other Mounts.
+func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth.Credentials, source string, fsTypeName string, opts *MountOptions) (*Mount, error) {
rft := vfs.getFilesystemType(fsTypeName)
if rft == nil {
- return syserror.ENODEV
+ return nil, syserror.ENODEV
}
if !opts.InternalMount && !rft.opts.AllowUserMount {
- return syserror.ENODEV
+ return nil, syserror.ENODEV
}
fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
if err != nil {
- return err
+ return nil, err
}
+ defer root.DecRef()
+ defer fs.DecRef()
+ return vfs.NewDisconnectedMount(fs, root, opts)
+}
+// ConnectMountAt connects mnt at the path represented by target.
+//
+// Preconditions: mnt must be disconnected.
+func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Credentials, mnt *Mount, target *PathOperation) error {
// We can't hold vfs.mountMu while calling FilesystemImpl methods due to
// lock ordering.
vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
if err != nil {
- root.DecRef()
- fs.DecRef()
return err
}
vfs.mountMu.Lock()
@@ -204,8 +219,6 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
vd.dentry.mu.Unlock()
vfs.mountMu.Unlock()
vd.DecRef()
- root.DecRef()
- fs.DecRef()
return syserror.ENOENT
}
// vd might have been mounted over between vfs.GetDentryAt() and
@@ -238,7 +251,6 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
// point and the mount root are directories, or neither are, and returns
// ENOTDIR if this is not the case.
mntns := vd.mount.ns
- mnt := newMount(vfs, fs, root, mntns, opts)
vfs.mounts.seq.BeginWrite()
vfs.connectLocked(mnt, vd, mntns)
vfs.mounts.seq.EndWrite()
@@ -247,6 +259,19 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
return nil
}
+// MountAt creates and mounts a Filesystem configured by the given arguments.
+func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
+ mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts)
+ if err != nil {
+ return err
+ }
+ if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil {
+ mnt.DecRef()
+ return err
+ }
+ return nil
+}
+
// UmountAt removes the Mount at the given path.
func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error {
if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 {
@@ -377,6 +402,7 @@ func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns
}
vd.mount.children[mnt] = struct{}{}
atomic.AddUint32(&vd.dentry.mounts, 1)
+ mnt.ns = mntns
mntns.mountpoints[vd.dentry]++
vfs.mounts.insertSeqed(mnt)
vfsmpmounts, ok := vfs.mountpoints[vd.dentry]
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 8d7f8f8af..52643a7c5 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -82,6 +82,10 @@ type VirtualFilesystem struct {
// mountpoints is analogous to Linux's mountpoint_hashtable.
mountpoints map[*Dentry]map[*Mount]struct{}
+ // lastMountID is the last allocated mount ID. lastMountID is accessed
+ // using atomic memory operations.
+ lastMountID uint64
+
// anonMount is a Mount, not included in mounts or mountpoints,
// representing an anonFilesystem. anonMount is used to back
// VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
@@ -418,6 +422,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
}
}
+ fd.Dentry().InotifyWithParent(linux.IN_OPEN, 0, PathEvent)
return fd, nil
}
if !rp.handleError(err) {
diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
index 9bf67686d..5eb78b398 100644
--- a/pkg/tcpip/link/channel/channel.go
+++ b/pkg/tcpip/link/channel/channel.go
@@ -187,7 +187,7 @@ func (e *Endpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt stack
// InjectLinkAddr injects an inbound packet with a remote link address.
func (e *Endpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt stack.PacketBuffer) {
- e.dispatcher.DeliverNetworkPacket(e, remote, "" /* local */, protocol, pkt)
+ e.dispatcher.DeliverNetworkPacket(remote, "" /* local */, protocol, pkt)
}
// Attach saves the stack network-layer dispatcher for use later when packets
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index affa1bbdf..5ee508d48 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -642,7 +642,7 @@ func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
// InjectInbound injects an inbound packet.
func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
- e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, pkt)
+ e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, protocol, pkt)
}
// NewInjectable creates a new fd-based InjectableEndpoint.
diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go
index 3bfb15a8e..6f41a71a8 100644
--- a/pkg/tcpip/link/fdbased/endpoint_test.go
+++ b/pkg/tcpip/link/fdbased/endpoint_test.go
@@ -103,7 +103,7 @@ func (c *context) cleanup() {
}
}
-func (c *context) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
+func (c *context) DeliverNetworkPacket(remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
c.ch <- packetInfo{remote, protocol, pkt}
}
diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go
index fe2bf3b0b..ca4229ed6 100644
--- a/pkg/tcpip/link/fdbased/mmap.go
+++ b/pkg/tcpip/link/fdbased/mmap.go
@@ -191,7 +191,7 @@ func (d *packetMMapDispatcher) dispatch() (bool, *tcpip.Error) {
}
pkt = pkt[d.e.hdrSize:]
- d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, stack.PacketBuffer{
+ d.e.dispatcher.DeliverNetworkPacket(remote, local, p, stack.PacketBuffer{
Data: buffer.View(pkt).ToVectorisedView(),
LinkHeader: buffer.View(eth),
})
diff --git a/pkg/tcpip/link/fdbased/packet_dispatchers.go b/pkg/tcpip/link/fdbased/packet_dispatchers.go
index cb4cbea69..26c96a655 100644
--- a/pkg/tcpip/link/fdbased/packet_dispatchers.go
+++ b/pkg/tcpip/link/fdbased/packet_dispatchers.go
@@ -145,7 +145,7 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) {
}
pkt.Data.TrimFront(d.e.hdrSize)
- d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, pkt)
+ d.e.dispatcher.DeliverNetworkPacket(remote, local, p, pkt)
// Prepare e.views for another packet: release used views.
for i := 0; i < used; i++ {
@@ -169,7 +169,7 @@ type recvMMsgDispatcher struct {
// iovecs is an array of array of iovec records where each iovec base
// pointer and length are initialzed to the corresponding view above,
- // except when GSO is neabled then the first iovec in each array of
+ // except when GSO is enabled then the first iovec in each array of
// iovecs points to a buffer for the vnet header which is stripped
// before the views are passed up the stack for further processing.
iovecs [][]syscall.Iovec
@@ -301,7 +301,7 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) {
LinkHeader: buffer.View(eth),
}
pkt.Data.TrimFront(d.e.hdrSize)
- d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, pkt)
+ d.e.dispatcher.DeliverNetworkPacket(remote, local, p, pkt)
// Prepare e.views for another packet: release used views.
for i := 0; i < used; i++ {
diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go
index 073c84ef9..20d9e95f6 100644
--- a/pkg/tcpip/link/loopback/loopback.go
+++ b/pkg/tcpip/link/loopback/loopback.go
@@ -84,7 +84,7 @@ func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.Netw
// Because we're immediately turning around and writing the packet back
// to the rx path, we intentionally don't preserve the remote and local
// link addresses from the stack.Route we're passed.
- e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, stack.PacketBuffer{
+ e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, protocol, stack.PacketBuffer{
Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
})
@@ -106,7 +106,7 @@ func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
}
linkHeader := header.Ethernet(hdr)
vv.TrimFront(len(linkHeader))
- e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, linkHeader.Type(), stack.PacketBuffer{
+ e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, linkHeader.Type(), stack.PacketBuffer{
Data: vv,
LinkHeader: buffer.View(linkHeader),
})
diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go
index a5478ce17..f0769830a 100644
--- a/pkg/tcpip/link/muxed/injectable.go
+++ b/pkg/tcpip/link/muxed/injectable.go
@@ -81,7 +81,7 @@ func (m *InjectableEndpoint) IsAttached() bool {
// InjectInbound implements stack.InjectableLinkEndpoint.
func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
- m.dispatcher.DeliverNetworkPacket(m, "" /* remote */, "" /* local */, protocol, pkt)
+ m.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, protocol, pkt)
}
// WritePackets writes outbound packets to the appropriate
diff --git a/pkg/tcpip/link/qdisc/fifo/endpoint.go b/pkg/tcpip/link/qdisc/fifo/endpoint.go
index 54432194d..ec5c5048a 100644
--- a/pkg/tcpip/link/qdisc/fifo/endpoint.go
+++ b/pkg/tcpip/link/qdisc/fifo/endpoint.go
@@ -102,8 +102,8 @@ func (q *queueDispatcher) dispatchLoop() {
}
// DeliverNetworkPacket implements stack.NetworkDispatcher.DeliverNetworkPacket.
-func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
- e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, pkt)
+func (e *endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
+ e.dispatcher.DeliverNetworkPacket(remote, local, protocol, pkt)
}
// Attach implements stack.LinkEndpoint.Attach.
diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go
index 0796d717e..f5dec0a7f 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem.go
@@ -275,7 +275,7 @@ func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) {
// Send packet up the stack.
eth := header.Ethernet(b[:header.EthernetMinimumSize])
- d.DeliverNetworkPacket(e, eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), stack.PacketBuffer{
+ d.DeliverNetworkPacket(eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), stack.PacketBuffer{
Data: buffer.View(b[header.EthernetMinimumSize:]).ToVectorisedView(),
LinkHeader: buffer.View(eth),
})
diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go
index 33f640b85..f3fc62607 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem_test.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go
@@ -131,7 +131,7 @@ func newTestContext(t *testing.T, mtu, bufferSize uint32, addr tcpip.LinkAddress
return c
}
-func (c *testContext) DeliverNetworkPacket(_ stack.LinkEndpoint, remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
+func (c *testContext) DeliverNetworkPacket(remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
c.mu.Lock()
c.packets = append(c.packets, packetInfo{
addr: remoteLinkAddr,
diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index da1c520ae..b060d4627 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -120,9 +120,9 @@ func NewWithWriter(lower stack.LinkEndpoint, writer io.Writer, snapLen uint32) (
// DeliverNetworkPacket implements the stack.NetworkDispatcher interface. It is
// called by the link-layer endpoint being wrapped when a packet arrives, and
// logs the packet before forwarding to the actual dispatcher.
-func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
+func (e *endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
e.dumpPacket("recv", nil, protocol, &pkt)
- e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, pkt)
+ e.dispatcher.DeliverNetworkPacket(remote, local, protocol, pkt)
}
// Attach implements the stack.LinkEndpoint interface. It saves the dispatcher
diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go
index 2b3741276..f5a05929f 100644
--- a/pkg/tcpip/link/waitable/waitable.go
+++ b/pkg/tcpip/link/waitable/waitable.go
@@ -50,12 +50,12 @@ func New(lower stack.LinkEndpoint) *Endpoint {
// It is called by the link-layer endpoint being wrapped when a packet arrives,
// and only forwards to the actual dispatcher if Wait or WaitDispatch haven't
// been called.
-func (e *Endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
+func (e *Endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
if !e.dispatchGate.Enter() {
return
}
- e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, pkt)
+ e.dispatcher.DeliverNetworkPacket(remote, local, protocol, pkt)
e.dispatchGate.Leave()
}
diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go
index 54eb5322b..0a9b99f18 100644
--- a/pkg/tcpip/link/waitable/waitable_test.go
+++ b/pkg/tcpip/link/waitable/waitable_test.go
@@ -35,7 +35,7 @@ type countedEndpoint struct {
dispatcher stack.NetworkDispatcher
}
-func (e *countedEndpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
+func (e *countedEndpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
e.dispatchCount++
}
@@ -120,21 +120,21 @@ func TestWaitDispatch(t *testing.T) {
}
// Dispatch and check that it goes through.
- ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, stack.PacketBuffer{})
+ ep.dispatcher.DeliverNetworkPacket("", "", 0, stack.PacketBuffer{})
if want := 1; ep.dispatchCount != want {
t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
}
// Wait on writes, then try to dispatch. It must go through.
wep.WaitWrite()
- ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, stack.PacketBuffer{})
+ ep.dispatcher.DeliverNetworkPacket("", "", 0, stack.PacketBuffer{})
if want := 2; ep.dispatchCount != want {
t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
}
// Wait on dispatches, then try to dispatch. It must not go through.
wep.WaitDispatch()
- ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, stack.PacketBuffer{})
+ ep.dispatcher.DeliverNetworkPacket("", "", 0, stack.PacketBuffer{})
if want := 2; ep.dispatchCount != want {
t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
}
diff --git a/pkg/tcpip/stack/forwarder_test.go b/pkg/tcpip/stack/forwarder_test.go
index 8084d50bc..344d60baa 100644
--- a/pkg/tcpip/stack/forwarder_test.go
+++ b/pkg/tcpip/stack/forwarder_test.go
@@ -209,7 +209,7 @@ func (e *fwdTestLinkEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber
// InjectLinkAddr injects an inbound packet with a remote link address.
func (e *fwdTestLinkEndpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt PacketBuffer) {
- e.dispatcher.DeliverNetworkPacket(e, remote, "" /* local */, protocol, pkt)
+ e.dispatcher.DeliverNetworkPacket(remote, "" /* local */, protocol, pkt)
}
// Attach saves the stack network-layer dispatcher for use later when packets
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 54103fdb3..9213cbb9a 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -1167,7 +1167,7 @@ func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address,
// Note that the ownership of the slice backing vv is retained by the caller.
// This rule applies only to the slice itself, not to the items of the slice;
// the ownership of the items is not retained by the caller.
-func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) {
+func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) {
n.mu.RLock()
enabled := n.mu.enabled
// If the NIC is not yet enabled, don't receive any packets.
@@ -1240,7 +1240,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
}
if ref := n.getRef(protocol, dst); ref != nil {
- handlePacket(protocol, dst, src, linkEP.LinkAddress(), remote, ref, pkt)
+ handlePacket(protocol, dst, src, n.linkEP.LinkAddress(), remote, ref, pkt)
return
}
diff --git a/pkg/tcpip/stack/nic_test.go b/pkg/tcpip/stack/nic_test.go
index d672fc157..b01b3f476 100644
--- a/pkg/tcpip/stack/nic_test.go
+++ b/pkg/tcpip/stack/nic_test.go
@@ -44,7 +44,7 @@ func TestDisabledRxStatsWhenNICDisabled(t *testing.T) {
t.FailNow()
}
- nic.DeliverNetworkPacket(nil, "", "", 0, PacketBuffer{Data: buffer.View([]byte{1, 2, 3, 4}).ToVectorisedView()})
+ nic.DeliverNetworkPacket("", "", 0, PacketBuffer{Data: buffer.View([]byte{1, 2, 3, 4}).ToVectorisedView()})
if got := nic.stats.DisabledRx.Packets.Value(); got != 1 {
t.Errorf("got DisabledRx.Packets = %d, want = 1", got)
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index b331427c6..a25657cd7 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -326,7 +326,7 @@ type NetworkDispatcher interface {
// packets sent via loopback), and won't have the field set.
//
// DeliverNetworkPacket takes ownership of pkt.
- DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer)
+ DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer)
}
// LinkEndpointCapabilities is the type associated with the capabilities
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index a7e088d4e..e4a06c9e1 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -1347,6 +1347,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
e.setEndpointState(StateError)
e.HardError = err
+ e.workerCleanup = true
// Lock released below.
epilogue()
return err
diff --git a/pkg/test/dockerutil/dockerutil.go b/pkg/test/dockerutil/dockerutil.go
index 06f81d28d..c45d2ecbc 100644
--- a/pkg/test/dockerutil/dockerutil.go
+++ b/pkg/test/dockerutil/dockerutil.go
@@ -148,6 +148,62 @@ func (m MountMode) String() string {
panic(fmt.Sprintf("invalid mode: %d", m))
}
+// DockerNetwork contains the name of a docker network.
+type DockerNetwork struct {
+ logger testutil.Logger
+ Name string
+ Subnet *net.IPNet
+ containers []*Docker
+}
+
+// NewDockerNetwork sets up the struct for a Docker network. Names of networks
+// will be unique.
+func NewDockerNetwork(logger testutil.Logger) *DockerNetwork {
+ return &DockerNetwork{
+ logger: logger,
+ Name: testutil.RandomID(logger.Name()),
+ }
+}
+
+// Create calls 'docker network create'.
+func (n *DockerNetwork) Create(args ...string) error {
+ a := []string{"docker", "network", "create"}
+ if n.Subnet != nil {
+ a = append(a, fmt.Sprintf("--subnet=%s", n.Subnet))
+ }
+ a = append(a, args...)
+ a = append(a, n.Name)
+ return testutil.Command(n.logger, a...).Run()
+}
+
+// Connect calls 'docker network connect' with the arguments provided.
+func (n *DockerNetwork) Connect(container *Docker, args ...string) error {
+ a := []string{"docker", "network", "connect"}
+ a = append(a, args...)
+ a = append(a, n.Name, container.Name)
+ if err := testutil.Command(n.logger, a...).Run(); err != nil {
+ return err
+ }
+ n.containers = append(n.containers, container)
+ return nil
+}
+
+// Cleanup cleans up the docker network and all the containers attached to it.
+func (n *DockerNetwork) Cleanup() error {
+ for _, c := range n.containers {
+ // Don't propagate the error, it might be that the container
+ // was already cleaned up.
+ if err := c.Kill(); err != nil {
+ n.logger.Logf("unable to kill container during cleanup: %s", err)
+ }
+ }
+
+ if err := testutil.Command(n.logger, "docker", "network", "rm", n.Name).Run(); err != nil {
+ return err
+ }
+ return nil
+}
+
// Docker contains the name and the runtime of a docker container.
type Docker struct {
logger testutil.Logger
@@ -313,7 +369,9 @@ func (d *Docker) argsFor(r *RunOpts, command string, p []string) (rv []string) {
rv = append(rv, d.Name)
} else {
rv = append(rv, d.mounts...)
- rv = append(rv, fmt.Sprintf("--runtime=%s", d.Runtime))
+ if len(d.Runtime) > 0 {
+ rv = append(rv, fmt.Sprintf("--runtime=%s", d.Runtime))
+ }
rv = append(rv, fmt.Sprintf("--name=%s", d.Name))
rv = append(rv, testutil.ImageByName(r.Image))
}
@@ -481,6 +539,56 @@ func (d *Docker) FindIP() (net.IP, error) {
return ip, nil
}
+// A NetworkInterface is container's network interface information.
+type NetworkInterface struct {
+ IPv4 net.IP
+ MAC net.HardwareAddr
+}
+
+// ListNetworks returns the network interfaces of the container, keyed by
+// Docker network name.
+func (d *Docker) ListNetworks() (map[string]NetworkInterface, error) {
+ const format = `{{json .NetworkSettings.Networks}}`
+ out, err := testutil.Command(d.logger, "docker", "inspect", "-f", format, d.Name).CombinedOutput()
+ if err != nil {
+ return nil, fmt.Errorf("error network interfaces: %q: %w", string(out), err)
+ }
+
+ networks := map[string]map[string]string{}
+ if err := json.Unmarshal(out, &networks); err != nil {
+ return nil, fmt.Errorf("error decoding network interfaces: %w", err)
+ }
+
+ interfaces := map[string]NetworkInterface{}
+ for name, iface := range networks {
+ var netface NetworkInterface
+
+ rawIP := strings.TrimSpace(iface["IPAddress"])
+ if rawIP != "" {
+ ip := net.ParseIP(rawIP)
+ if ip == nil {
+ return nil, fmt.Errorf("invalid IP: %q", rawIP)
+ }
+ // Docker's IPAddress field is IPv4. The IPv6 address
+ // is stored in the GlobalIPv6Address field.
+ netface.IPv4 = ip
+ }
+
+ rawMAC := strings.TrimSpace(iface["MacAddress"])
+ if rawMAC != "" {
+ mac, err := net.ParseMAC(rawMAC)
+ if err != nil {
+ return nil, fmt.Errorf("invalid MAC: %q: %w", rawMAC, err)
+ }
+ netface.MAC = mac
+ }
+
+ interfaces[name] = netface
+ }
+
+ return interfaces, nil
+}
+
// SandboxPid returns the PID to the sandbox process.
func (d *Docker) SandboxPid() (int, error) {
out, err := testutil.Command(d.logger, "docker", "inspect", "-f={{.State.Pid}}", d.Name).CombinedOutput()
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index e1181271a..9cd4f97da 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -221,9 +221,6 @@ func mountFlags(opts []string) fs.MountSourceFlags {
mf.NoAtime = true
case "noexec":
mf.NoExec = true
- case "bind", "rbind":
- // When options include either "bind" or "rbind",
- // it's converted to a 9P mount.
default:
log.Warningf("ignoring unknown mount option %q", o)
}
@@ -770,10 +767,6 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
useOverlay bool
)
- if isBindMount(m) {
- m.Type = bind
- }
-
switch m.Type {
case devpts.Name, devtmpfs.Name, procvfs2.Name, sysvfs2.Name:
fsName = m.Type
@@ -801,18 +794,6 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
return fsName, opts, useOverlay, nil
}
-func isBindMount(m specs.Mount) bool {
- for _, opt := range m.Options {
- // When options include either "bind" or "rbind", this behaves as
- // bind mount even if the mount type is equal to a filesystem supported
- // on runsc.
- if opt == "bind" || opt == "rbind" {
- return true
- }
- }
- return false
-}
-
func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType {
if hint := c.hints.findMount(mount); hint != nil {
return hint.fileAccessType()
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index 147c901c4..fefeeff58 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -235,7 +235,7 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) {
fd := -1
// Only bind mounts use host FDs; see
// containerMounter.getMountNameAndOptionsVFS2.
- if m.Type == bind || isBindMount(m) {
+ if m.Type == bind {
fd = c.fds.remove()
}
mounts = append(mounts, mountAndFD{
@@ -305,10 +305,6 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF
useOverlay bool
)
- if isBindMount(m.Mount) {
- m.Type = bind
- }
-
switch m.Type {
case devpts.Name, devtmpfs.Name, proc.Name, sys.Name:
fsName = m.Type
diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD
index d4c7bdfbb..c087e1a3c 100644
--- a/runsc/cgroup/BUILD
+++ b/runsc/cgroup/BUILD
@@ -7,8 +7,8 @@ go_library(
srcs = ["cgroup.go"],
visibility = ["//:sandbox"],
deps = [
+ "//pkg/cleanup",
"//pkg/log",
- "//runsc/specutils",
"@com_github_cenkalti_backoff//:go_default_library",
"@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
],
diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go
index 19c8b0db6..ef01820ef 100644
--- a/runsc/cgroup/cgroup.go
+++ b/runsc/cgroup/cgroup.go
@@ -31,8 +31,8 @@ import (
"github.com/cenkalti/backoff"
specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/log"
- "gvisor.dev/gvisor/runsc/specutils"
)
const (
@@ -246,7 +246,7 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error {
// The Cleanup object cleans up partially created cgroups when an error occurs.
// Errors occuring during cleanup itself are ignored.
- clean := specutils.MakeCleanup(func() { _ = c.Uninstall() })
+ clean := cleanup.Make(func() { _ = c.Uninstall() })
defer clean.Clean()
for key, cfg := range controllers {
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index 46154df60..9a856d65c 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -16,6 +16,7 @@ go_library(
],
deps = [
"//pkg/abi/linux",
+ "//pkg/cleanup",
"//pkg/log",
"//pkg/sentry/control",
"//pkg/sentry/sighandling",
@@ -53,6 +54,7 @@ go_test(
deps = [
"//pkg/abi/linux",
"//pkg/bits",
+ "//pkg/cleanup",
"//pkg/log",
"//pkg/sentry/control",
"//pkg/sentry/kernel",
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 8539f252d..6d297d0df 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -31,6 +31,7 @@ import (
"github.com/cenkalti/backoff"
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/control"
"gvisor.dev/gvisor/pkg/sentry/sighandling"
@@ -293,7 +294,7 @@ func New(conf *boot.Config, args Args) (*Container, error) {
}
// The Cleanup object cleans up partially created containers when an error
// occurs. Any errors occurring during cleanup itself are ignored.
- cu := specutils.MakeCleanup(func() { _ = c.Destroy() })
+ cu := cleanup.Make(func() { _ = c.Destroy() })
defer cu.Clean()
// Lock the container metadata file to prevent concurrent creations of
@@ -402,7 +403,7 @@ func (c *Container) Start(conf *boot.Config) error {
if err := c.Saver.lock(); err != nil {
return err
}
- unlock := specutils.MakeCleanup(func() { c.Saver.unlock() })
+ unlock := cleanup.Make(func() { c.Saver.unlock() })
defer unlock.Clean()
if err := c.requireStatus("start", Created); err != nil {
@@ -506,7 +507,7 @@ func Run(conf *boot.Config, args Args) (syscall.WaitStatus, error) {
}
// Clean up partially created container if an error occurs.
// Any errors returned by Destroy() itself are ignored.
- cu := specutils.MakeCleanup(func() {
+ cu := cleanup.Make(func() {
c.Destroy()
})
defer cu.Clean()
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 1a6d50d0d..d59a1d97e 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -1537,28 +1537,6 @@ func TestReadonlyMount(t *testing.T) {
}
}
-func TestBindMountByOption(t *testing.T) {
- for _, conf := range configs(t, overlay) {
- t.Logf("Running test with conf: %+v", conf)
-
- dir, err := ioutil.TempDir(testutil.TmpDir(), "bind-mount")
- spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file"))
- if err != nil {
- t.Fatalf("ioutil.TempDir() failed: %v", err)
- }
- spec.Mounts = append(spec.Mounts, specs.Mount{
- Destination: dir,
- Source: dir,
- Type: "none",
- Options: []string{"rw", "bind"},
- })
-
- if err := run(spec, conf); err != nil {
- t.Fatalf("error running sandbox: %v", err)
- }
- }
-}
-
// TestAbbreviatedIDs checks that runsc supports using abbreviated container
// IDs in place of full IDs.
func TestAbbreviatedIDs(t *testing.T) {
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index f6861b1dd..dc825abd9 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -27,6 +27,7 @@ import (
"time"
specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/sentry/control"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sync"
@@ -64,29 +65,16 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
panic("conf.RootDir not set. Call testutil.SetupRootDir() to set.")
}
- var (
- containers []*Container
- cleanups []func()
- )
- cleanups = append(cleanups, func() {
- for _, c := range containers {
- c.Destroy()
- }
- })
- cleanupAll := func() {
- for _, c := range cleanups {
- c()
- }
- }
- localClean := specutils.MakeCleanup(cleanupAll)
- defer localClean.Clean()
+ cu := cleanup.Cleanup{}
+ defer cu.Clean()
+ var containers []*Container
for i, spec := range specs {
bundleDir, cleanup, err := testutil.SetupBundleDir(spec)
if err != nil {
return nil, nil, fmt.Errorf("error setting up container: %v", err)
}
- cleanups = append(cleanups, cleanup)
+ cu.Add(cleanup)
args := Args{
ID: ids[i],
@@ -97,6 +85,7 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
if err != nil {
return nil, nil, fmt.Errorf("error creating container: %v", err)
}
+ cu.Add(func() { cont.Destroy() })
containers = append(containers, cont)
if err := cont.Start(conf); err != nil {
@@ -104,8 +93,7 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
}
}
- localClean.Release()
- return containers, cleanupAll, nil
+ return containers, cu.Release(), nil
}
type execDesc struct {
@@ -1394,7 +1382,7 @@ func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) {
Destination: "/mydir/test",
Source: "/some/dir",
Type: "tmpfs",
- Options: []string{"rw", "relatime"},
+ Options: []string{"rw", "rbind", "relatime"},
}
podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD
index 64a406ae2..1036b0630 100644
--- a/runsc/fsgofer/BUILD
+++ b/runsc/fsgofer/BUILD
@@ -13,12 +13,12 @@ go_library(
visibility = ["//runsc:__subpackages__"],
deps = [
"//pkg/abi/linux",
+ "//pkg/cleanup",
"//pkg/fd",
"//pkg/log",
"//pkg/p9",
"//pkg/sync",
"//pkg/syserr",
- "//runsc/specutils",
"@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index 1942f50d7..edc239013 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -33,11 +33,11 @@ import (
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/runsc/specutils"
)
const (
@@ -439,7 +439,7 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid
if err != nil {
return nil, nil, p9.QID{}, 0, extractErrno(err)
}
- cu := specutils.MakeCleanup(func() {
+ cu := cleanup.Make(func() {
child.Close()
// Best effort attempt to remove the file in case of failure.
if err := syscall.Unlinkat(l.file.FD(), name); err != nil {
@@ -480,7 +480,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID)
if err := syscall.Mkdirat(l.file.FD(), name, uint32(perm.Permissions())); err != nil {
return p9.QID{}, extractErrno(err)
}
- cu := specutils.MakeCleanup(func() {
+ cu := cleanup.Make(func() {
// Best effort attempt to remove the dir in case of failure.
if err := unix.Unlinkat(l.file.FD(), name, unix.AT_REMOVEDIR); err != nil {
log.Warningf("error unlinking dir %q after failure: %v", path.Join(l.hostPath, name), err)
@@ -864,7 +864,7 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.
if err := unix.Symlinkat(target, l.file.FD(), newName); err != nil {
return p9.QID{}, extractErrno(err)
}
- cu := specutils.MakeCleanup(func() {
+ cu := cleanup.Make(func() {
// Best effort attempt to remove the symlink in case of failure.
if err := syscall.Unlinkat(l.file.FD(), newName); err != nil {
log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, newName), err)
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD
index c95d50294..035dcd3e3 100644
--- a/runsc/sandbox/BUILD
+++ b/runsc/sandbox/BUILD
@@ -13,6 +13,7 @@ go_library(
"//runsc:__subpackages__",
],
deps = [
+ "//pkg/cleanup",
"//pkg/control/client",
"//pkg/control/server",
"//pkg/log",
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index e4ec16e2f..6e1a2af25 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -30,6 +30,7 @@ import (
"github.com/cenkalti/backoff"
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/syndtr/gocapability/capability"
+ "gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/control/client"
"gvisor.dev/gvisor/pkg/control/server"
"gvisor.dev/gvisor/pkg/log"
@@ -119,7 +120,7 @@ func New(conf *boot.Config, args *Args) (*Sandbox, error) {
s := &Sandbox{ID: args.ID, Cgroup: args.Cgroup}
// The Cleanup object cleans up partially created sandboxes when an error
// occurs. Any errors occurring during cleanup itself are ignored.
- c := specutils.MakeCleanup(func() {
+ c := cleanup.Make(func() {
err := s.destroy()
log.Warningf("error destroying sandbox: %v", err)
})
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
index 202518b58..f1fa573c5 100644
--- a/runsc/specutils/specutils.go
+++ b/runsc/specutils/specutils.go
@@ -311,19 +311,7 @@ func capsFromNames(names []string, skipSet map[linux.Capability]struct{}) (auth.
// Is9PMount returns true if the given mount can be mounted as an external gofer.
func Is9PMount(m specs.Mount) bool {
- var isBind bool
- switch m.Type {
- case "bind":
- isBind = true
- default:
- for _, opt := range m.Options {
- if opt == "bind" || opt == "rbind" {
- isBind = true
- break
- }
- }
- }
- return isBind && m.Source != "" && IsSupportedDevMount(m)
+ return m.Type == "bind" && m.Source != "" && IsSupportedDevMount(m)
}
// IsSupportedDevMount returns true if the mount is a supported /dev mount.
@@ -456,36 +444,6 @@ func ContainsStr(strs []string, str string) bool {
return false
}
-// Cleanup allows defers to be aborted when cleanup needs to happen
-// conditionally. Usage:
-// c := MakeCleanup(func() { f.Close() })
-// defer c.Clean() // any failure before release is called will close the file.
-// ...
-// c.Release() // on success, aborts closing the file and return it.
-// return f
-type Cleanup struct {
- clean func()
-}
-
-// MakeCleanup creates a new Cleanup object.
-func MakeCleanup(f func()) Cleanup {
- return Cleanup{clean: f}
-}
-
-// Clean calls the cleanup function.
-func (c *Cleanup) Clean() {
- if c.clean != nil {
- c.clean()
- c.clean = nil
- }
-}
-
-// Release releases the cleanup from its duties, i.e. cleanup function is not
-// called after this point.
-func (c *Cleanup) Release() {
- c.clean = nil
-}
-
// RetryEintr retries the function until an error different than EINTR is
// returned.
func RetryEintr(f func() (uintptr, uintptr, error)) (uintptr, uintptr, error) {
diff --git a/scripts/build.sh b/scripts/build.sh
deleted file mode 100755
index e821e8624..000000000
--- a/scripts/build.sh
+++ /dev/null
@@ -1,84 +0,0 @@
-#!/bin/bash
-
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-source $(dirname $0)/common.sh
-
-# Build runsc.
-runsc=$(build -c opt //runsc)
-
-# Build packages.
-pkgs=$(build -c opt //runsc:runsc-debian)
-
-# Stop here if we have no artifacts directory.
-[[ -v KOKORO_ARTIFACTS_DIR ]] || exit 0
-
-# install_raw installs raw artifacts.
-install_raw() {
- mkdir -p "$1"
- cp -f "${runsc}" "$1"/runsc
- sha512sum "$1"/runsc | awk '{print $1 " runsc"}' > "$1"/runsc.sha512
-}
-
-# Build a repository, if the key is available.
-#
-# Note that make_repository.sh script will install packages into the provided
-# root, but will output to stdout a directory that can be copied arbitrarily
-# into "${KOKORO_ARTIFACTS_DIR}"/dists/XXX. We do things this way because we
-# will copy the same repository structure into multiple locations, below.
-if [[ -v KOKORO_REPO_KEY ]]; then
- repo=$(tools/make_repository.sh \
- "${KOKORO_KEYSTORE_DIR}/${KOKORO_REPO_KEY}" \
- gvisor-bot@google.com \
- "${KOKORO_ARTIFACTS_DIR}" \
- ${pkgs})
-fi
-
-# install_repo installs a repository.
-#
-# Note that packages are already installed, as noted above.
-install_repo() {
- if [[ -v repo ]]; then
- rm -rf "$1" && mkdir -p "$(dirname "$1")" && cp -a "${repo}" "$1"
- fi
-}
-
-# If nightly, install only nightly artifacts.
-if [[ "${KOKORO_BUILD_NIGHTLY:-false}" == "true" ]]; then
- # The "latest" directory and current date.
- stamp="$(date -Idate)"
- install_raw "${KOKORO_ARTIFACTS_DIR}/nightly/latest"
- install_raw "${KOKORO_ARTIFACTS_DIR}/nightly/${stamp}"
- install_repo "${KOKORO_ARTIFACTS_DIR}/dists/nightly"
-else
- # Is it a tagged release? Build that.
- tags="$(git tag --points-at HEAD)"
- if ! [[ -z "${tags}" ]]; then
- # Note that a given commit can match any number of tags. We have to iterate
- # through all possible tags and produce associated artifacts.
- for tag in ${tags}; do
- name=$(echo "${tag}" | cut -d'-' -f2)
- base=$(echo "${name}" | cut -d'.' -f1)
- install_raw "${KOKORO_ARTIFACTS_DIR}/release/${name}"
- install_raw "${KOKORO_ARTIFACTS_DIR}/release/latest"
- install_repo "${KOKORO_ARTIFACTS_DIR}/dists/release"
- install_repo "${KOKORO_ARTIFACTS_DIR}/dists/${base}"
- done
- else
- # Otherwise, assume it is a raw master commit.
- install_raw "${KOKORO_ARTIFACTS_DIR}/master/latest"
- install_repo "${KOKORO_ARTIFACTS_DIR}/dists/master"
- fi
-fi
diff --git a/test/packetimpact/README.md b/test/packetimpact/README.md
index a82ad996a..f46c67a0c 100644
--- a/test/packetimpact/README.md
+++ b/test/packetimpact/README.md
@@ -18,6 +18,27 @@ Packetimpact aims to provide:
* **Control-flow** like for loops, conditionals, and variables.
* **Flexibilty** to specify every byte in a packet or use multiple sockets.
+## How to run packetimpact tests?
+
+Build the test container image by running the following at the root of the
+repository:
+
+```bash
+$ make load-packetimpact
+```
+
+Run a test, e.g. `fin_wait2_timeout`, against Linux:
+
+```bash
+$ bazel test //test/packetimpact/tests:fin_wait2_timeout_linux_test
+```
+
+Run the same test, but against gVisor:
+
+```bash
+$ bazel test //test/packetimpact/tests:fin_wait2_timeout_netstack_test
+```
+
## When to use packetimpact?
There are a few ways to write networking tests for gVisor currently:
diff --git a/test/packetimpact/netdevs/BUILD b/test/packetimpact/netdevs/BUILD
new file mode 100644
index 000000000..422bb9b0c
--- /dev/null
+++ b/test/packetimpact/netdevs/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_library")
+
+package(
+ licenses = ["notice"],
+)
+
+go_library(
+ name = "netdevs",
+ srcs = ["netdevs.go"],
+ visibility = ["//test/packetimpact:__subpackages__"],
+ deps = [
+ "//pkg/tcpip",
+ "//pkg/tcpip/header",
+ ],
+)
diff --git a/test/packetimpact/netdevs/netdevs.go b/test/packetimpact/netdevs/netdevs.go
new file mode 100644
index 000000000..d2c9cfeaf
--- /dev/null
+++ b/test/packetimpact/netdevs/netdevs.go
@@ -0,0 +1,104 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package netdevs contains utilities for working with network devices.
+package netdevs
+
+import (
+ "fmt"
+ "net"
+ "regexp"
+ "strings"
+
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+// A DeviceInfo represents a network device.
+type DeviceInfo struct {
+ MAC net.HardwareAddr
+ IPv4Addr net.IP
+ IPv4Net *net.IPNet
+ IPv6Addr net.IP
+ IPv6Net *net.IPNet
+}
+
+var (
+ deviceLine = regexp.MustCompile(`^\s*\d+: (\w+)`)
+ linkLine = regexp.MustCompile(`^\s*link/\w+ ([0-9a-fA-F:]+)`)
+ inetLine = regexp.MustCompile(`^\s*inet ([0-9./]+)`)
+ inet6Line = regexp.MustCompile(`^\s*inet6 ([0-9a-fA-Z:/]+)`)
+)
+
+// ParseDevices parses the output from `ip addr show` into a map from device
+// name to information about the device.
+func ParseDevices(cmdOutput string) (map[string]DeviceInfo, error) {
+ var currentDevice string
+ var currentInfo DeviceInfo
+ deviceInfos := make(map[string]DeviceInfo)
+ for _, line := range strings.Split(cmdOutput, "\n") {
+ if m := deviceLine.FindStringSubmatch(line); m != nil {
+ if currentDevice != "" {
+ deviceInfos[currentDevice] = currentInfo
+ }
+ currentInfo = DeviceInfo{}
+ currentDevice = m[1]
+ } else if m := linkLine.FindStringSubmatch(line); m != nil {
+ mac, err := net.ParseMAC(m[1])
+ if err != nil {
+ return nil, err
+ }
+ currentInfo.MAC = mac
+ } else if m := inetLine.FindStringSubmatch(line); m != nil {
+ ipv4Addr, ipv4Net, err := net.ParseCIDR(m[1])
+ if err != nil {
+ return nil, err
+ }
+ currentInfo.IPv4Addr = ipv4Addr
+ currentInfo.IPv4Net = ipv4Net
+ } else if m := inet6Line.FindStringSubmatch(line); m != nil {
+ ipv6Addr, ipv6Net, err := net.ParseCIDR(m[1])
+ if err != nil {
+ return nil, err
+ }
+ currentInfo.IPv6Addr = ipv6Addr
+ currentInfo.IPv6Net = ipv6Net
+ }
+ }
+ if currentDevice != "" {
+ deviceInfos[currentDevice] = currentInfo
+ }
+ return deviceInfos, nil
+}
+
+// MACToIP converts the MAC address to an IPv6 link local address as described
+// in RFC 4291 page 20: https://tools.ietf.org/html/rfc4291#page-20
+func MACToIP(mac net.HardwareAddr) net.IP {
+ addr := make([]byte, header.IPv6AddressSize)
+ addr[0] = 0xfe
+ addr[1] = 0x80
+ header.EthernetAdddressToModifiedEUI64IntoBuf(tcpip.LinkAddress(mac), addr[8:])
+ return net.IP(addr)
+}
+
+// FindDeviceByIP finds a DeviceInfo and device name from an IP address in the
+// output of ParseDevices.
+func FindDeviceByIP(ip net.IP, devices map[string]DeviceInfo) (string, DeviceInfo, error) {
+ for dev, info := range devices {
+ if info.IPv4Addr.Equal(ip) {
+ return dev, info, nil
+ }
+ }
+ return "", DeviceInfo{}, fmt.Errorf("can't find %s on any interface", ip)
+}
diff --git a/test/packetimpact/runner/BUILD b/test/packetimpact/runner/BUILD
new file mode 100644
index 000000000..0b68a760a
--- /dev/null
+++ b/test/packetimpact/runner/BUILD
@@ -0,0 +1,20 @@
+load("//tools:defs.bzl", "go_test")
+
+package(
+ default_visibility = ["//test/packetimpact:__subpackages__"],
+ licenses = ["notice"],
+)
+
+go_test(
+ name = "packetimpact_test",
+ srcs = ["packetimpact_test.go"],
+ tags = [
+ # Not intended to be run directly.
+ "local",
+ "manual",
+ ],
+ deps = [
+ "//pkg/test/dockerutil",
+ "//test/packetimpact/netdevs",
+ ],
+)
diff --git a/test/packetimpact/tests/defs.bzl b/test/packetimpact/runner/defs.bzl
index 45dce64ab..ea66b9756 100644
--- a/test/packetimpact/tests/defs.bzl
+++ b/test/packetimpact/runner/defs.bzl
@@ -11,12 +11,10 @@ def _packetimpact_test_impl(ctx):
# permission problems, because all runfiles may not be owned by the
# current user, and no other users will be mapped in that namespace.
# Make sure that everything is readable here.
- "find . -type f -exec chmod a+rx {} \\;",
- "find . -type d -exec chmod a+rx {} \\;",
- "%s %s --posix_server_binary %s --testbench_binary %s $@\n" % (
+ "find . -type f -or -type d -exec chmod a+rx {} \\;",
+ "%s %s --testbench_binary %s $@\n" % (
test_runner.short_path,
" ".join(ctx.attr.flags),
- ctx.files._posix_server_binary[0].short_path,
ctx.files.testbench_binary[0].short_path,
),
])
@@ -38,7 +36,7 @@ _packetimpact_test = rule(
"_test_runner": attr.label(
executable = True,
cfg = "target",
- default = ":test_runner",
+ default = ":packetimpact_test",
),
"_posix_server_binary": attr.label(
cfg = "target",
@@ -69,6 +67,7 @@ def packetimpact_linux_test(
Args:
name: name of the test
testbench_binary: the testbench binary
+ expect_failure: the test must fail
**kwargs: all the other args, forwarded to _packetimpact_test
"""
expect_failure_flag = ["--expect_failure"] if expect_failure else []
@@ -113,8 +112,8 @@ def packetimpact_go_test(name, size = "small", pure = True, expect_linux_failure
name: name of the test
size: size of the test
pure: make a static go binary
- expect_linux_failure: expect the test to fail for Linux
- expect_netstack_failure: expect the test to fail for Netstack
+ expect_linux_failure: the test must fail for Linux
+ expect_netstack_failure: the test must fail for Netstack
**kwargs: all the other args, forwarded to go_test
"""
testbench_binary = name + "_test"
diff --git a/test/packetimpact/runner/packetimpact_test.go b/test/packetimpact/runner/packetimpact_test.go
new file mode 100644
index 000000000..e58a1fb1b
--- /dev/null
+++ b/test/packetimpact/runner/packetimpact_test.go
@@ -0,0 +1,332 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// The runner starts docker containers and networking for a packetimpact test.
+package packetimpact_test
+
+import (
+ "flag"
+ "fmt"
+ "io/ioutil"
+ "log"
+ "math/rand"
+ "net"
+ "os"
+ "os/exec"
+ "path"
+ "strings"
+ "testing"
+ "time"
+
+ "gvisor.dev/gvisor/pkg/test/dockerutil"
+ "gvisor.dev/gvisor/test/packetimpact/netdevs"
+)
+
+// stringList implements flag.Value.
+type stringList []string
+
+// String implements flag.Value.String.
+func (l *stringList) String() string {
+ return strings.Join(*l, ",")
+}
+
+// Set implements flag.Value.Set.
+func (l *stringList) Set(value string) error {
+ *l = append(*l, value)
+ return nil
+}
+
+var (
+ dutPlatform = flag.String("dut_platform", "", "either \"linux\" or \"netstack\"")
+ testbenchBinary = flag.String("testbench_binary", "", "path to the testbench binary")
+ tshark = flag.Bool("tshark", false, "use more verbose tshark in logs instead of tcpdump")
+ extraTestArgs = stringList{}
+ expectFailure = flag.Bool("expect_failure", false, "expect that the test will fail when run")
+
+ dutAddr = net.IPv4(0, 0, 0, 10)
+ testbenchAddr = net.IPv4(0, 0, 0, 20)
+)
+
+const ctrlPort = "40000"
+
+// logger implements testutil.Logger.
+//
+// Labels logs based on their source and formats multi-line logs.
+type logger string
+
+// Name implements testutil.Logger.Name.
+func (l logger) Name() string {
+ return string(l)
+}
+
+// Logf implements testutil.Logger.Logf.
+func (l logger) Logf(format string, args ...interface{}) {
+ lines := strings.Split(fmt.Sprintf(format, args...), "\n")
+ log.Printf("%s: %s", l, lines[0])
+ for _, line := range lines[1:] {
+ log.Printf("%*s %s", len(l), "", line)
+ }
+}
+
+func TestOne(t *testing.T) {
+ flag.Var(&extraTestArgs, "extra_test_arg", "extra arguments to pass to the testbench")
+ flag.Parse()
+ if *dutPlatform != "linux" && *dutPlatform != "netstack" {
+ t.Fatal("--dut_platform should be either linux or netstack")
+ }
+ if *testbenchBinary == "" {
+ t.Fatal("--testbench_binary is missing")
+ }
+ if *dutPlatform == "netstack" {
+ if _, err := dockerutil.RuntimePath(); err != nil {
+ t.Fatal("--runtime is missing or invalid with --dut_platform=netstack:", err)
+ }
+ }
+ dockerutil.EnsureSupportedDockerVersion()
+
+ // Create the networks needed for the test. One control network is needed for
+ // the gRPC control packets and one test network on which to transmit the test
+ // packets.
+ ctrlNet := dockerutil.NewDockerNetwork(logger("ctrlNet"))
+ testNet := dockerutil.NewDockerNetwork(logger("testNet"))
+ for _, dn := range []*dockerutil.DockerNetwork{ctrlNet, testNet} {
+ for {
+ if err := createDockerNetwork(dn); err != nil {
+ t.Log("creating docker network:", err)
+ const wait = 100 * time.Millisecond
+ t.Logf("sleeping %s and will try creating docker network again", wait)
+ // This can fail if another docker network claimed the same IP so we'll
+ // just try again.
+ time.Sleep(wait)
+ continue
+ }
+ break
+ }
+ defer func(dn *dockerutil.DockerNetwork) {
+ if err := dn.Cleanup(); err != nil {
+ t.Errorf("unable to cleanup container %s: %s", dn.Name, err)
+ }
+ }(dn)
+ }
+
+ tmpDir, err := ioutil.TempDir("", "container-output")
+ if err != nil {
+ t.Fatal("creating temp dir:", err)
+ }
+ defer os.RemoveAll(tmpDir)
+
+ const testOutputDir = "/tmp/testoutput"
+
+ runOpts := dockerutil.RunOpts{
+ Image: "packetimpact",
+ CapAdd: []string{"NET_ADMIN"},
+ Extra: []string{"--sysctl", "net.ipv6.conf.all.disable_ipv6=0", "--rm", "-v", tmpDir + ":" + testOutputDir},
+ Foreground: true,
+ }
+
+ // Create the Docker container for the DUT.
+ dut := dockerutil.MakeDocker(logger("dut"))
+ if *dutPlatform == "linux" {
+ dut.Runtime = ""
+ }
+
+ const containerPosixServerBinary = "/packetimpact/posix_server"
+ dut.CopyFiles("/packetimpact", "/test/packetimpact/dut/posix_server")
+
+ if err := dut.Create(runOpts, containerPosixServerBinary, "--ip=0.0.0.0", "--port="+ctrlPort); err != nil {
+ t.Fatalf("unable to create container %s: %s", dut.Name, err)
+ }
+ defer dut.CleanUp()
+
+ // Add ctrlNet as eth1 and testNet as eth2.
+ const testNetDev = "eth2"
+ if err := addNetworks(dut, dutAddr, []*dockerutil.DockerNetwork{ctrlNet, testNet}); err != nil {
+ t.Fatal(err)
+ }
+
+ if err := dut.Start(); err != nil {
+ t.Fatalf("unable to start container %s: %s", dut.Name, err)
+ }
+
+ if _, err := dut.WaitForOutput("Server listening.*\n", 60*time.Second); err != nil {
+ t.Fatalf("%s on container %s never listened: %s", containerPosixServerBinary, dut.Name, err)
+ }
+
+ dutTestDevice, dutDeviceInfo, err := deviceByIP(dut, addressInSubnet(dutAddr, *testNet.Subnet))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ remoteMAC := dutDeviceInfo.MAC
+ remoteIPv6 := dutDeviceInfo.IPv6Addr
+ // Netstack as DUT doesn't assign IPv6 addresses automatically so do it if
+ // needed.
+ if remoteIPv6 == nil {
+ if _, err := dut.Exec(dockerutil.RunOpts{}, "ip", "addr", "add", netdevs.MACToIP(remoteMAC).String(), "scope", "link", "dev", dutTestDevice); err != nil {
+ t.Fatalf("unable to ip addr add on container %s: %s", dut.Name, err)
+ }
+ // Now try again, to make sure that it worked.
+ _, dutDeviceInfo, err = deviceByIP(dut, addressInSubnet(dutAddr, *testNet.Subnet))
+ if err != nil {
+ t.Fatal(err)
+ }
+ remoteIPv6 = dutDeviceInfo.IPv6Addr
+ if remoteIPv6 == nil {
+ t.Fatal("unable to set IPv6 address on container", dut.Name)
+ }
+ }
+
+ // Create the Docker container for the testbench.
+ testbench := dockerutil.MakeDocker(logger("testbench"))
+ testbench.Runtime = "" // The testbench always runs on Linux.
+
+ tbb := path.Base(*testbenchBinary)
+ containerTestbenchBinary := "/packetimpact/" + tbb
+ testbench.CopyFiles("/packetimpact", "/test/packetimpact/tests/"+tbb)
+
+ // Run tcpdump in the test bench unbuffered, without DNS resolution, just on
+ // the interface with the test packets.
+ snifferArgs := []string{
+ "tcpdump",
+ "-S", "-vvv", "-U", "-n",
+ "-i", testNetDev,
+ "-w", testOutputDir + "/dump.pcap",
+ }
+ snifferRegex := "tcpdump: listening.*\n"
+ if *tshark {
+ // Run tshark in the test bench unbuffered, without DNS resolution, just on
+ // the interface with the test packets.
+ snifferArgs = []string{
+ "tshark", "-V", "-l", "-n", "-i", testNetDev,
+ "-o", "tcp.check_checksum:TRUE",
+ "-o", "udp.check_checksum:TRUE",
+ }
+ snifferRegex = "Capturing on.*\n"
+ }
+
+ defer func() {
+ if err := exec.Command("/bin/cp", "-r", tmpDir, os.Getenv("TEST_UNDECLARED_OUTPUTS_DIR")).Run(); err != nil {
+ t.Error("unable to copy container output files:", err)
+ }
+ }()
+
+ if err := testbench.Create(runOpts, snifferArgs...); err != nil {
+ t.Fatalf("unable to create container %s: %s", testbench.Name, err)
+ }
+ defer testbench.CleanUp()
+
+ // Add ctrlNet as eth1 and testNet as eth2.
+ if err := addNetworks(testbench, testbenchAddr, []*dockerutil.DockerNetwork{ctrlNet, testNet}); err != nil {
+ t.Fatal(err)
+ }
+
+ if err := testbench.Start(); err != nil {
+ t.Fatalf("unable to start container %s: %s", testbench.Name, err)
+ }
+
+ // Kill so that it will flush output.
+ defer testbench.Exec(dockerutil.RunOpts{}, "killall", snifferArgs[0])
+
+ if _, err := testbench.WaitForOutput(snifferRegex, 60*time.Second); err != nil {
+ t.Fatalf("sniffer on %s never listened: %s", dut.Name, err)
+ }
+
+ // Because the Linux kernel receives the SYN-ACK but didn't send the SYN it
+ // will issue a RST. To prevent this IPtables can be used to filter out all
+ // incoming packets. The raw socket that packetimpact tests use will still see
+ // everything.
+ if _, err := testbench.Exec(dockerutil.RunOpts{}, "iptables", "-A", "INPUT", "-i", testNetDev, "-j", "DROP"); err != nil {
+ t.Fatalf("unable to Exec iptables on container %s: %s", testbench.Name, err)
+ }
+
+ // FIXME(b/156449515): Some piece of the system has a race. The old
+ // bash script version had a sleep, so we have one too. The race should
+ // be fixed and this sleep removed.
+ time.Sleep(time.Second)
+
+ // Start a packetimpact test on the test bench. The packetimpact test sends
+ // and receives packets and also sends POSIX socket commands to the
+ // posix_server to be executed on the DUT.
+ testArgs := []string{containerTestbenchBinary}
+ testArgs = append(testArgs, extraTestArgs...)
+ testArgs = append(testArgs,
+ "--posix_server_ip", addressInSubnet(dutAddr, *ctrlNet.Subnet).String(),
+ "--posix_server_port", ctrlPort,
+ "--remote_ipv4", addressInSubnet(dutAddr, *testNet.Subnet).String(),
+ "--local_ipv4", addressInSubnet(testbenchAddr, *testNet.Subnet).String(),
+ "--remote_ipv6", remoteIPv6.String(),
+ "--remote_mac", remoteMAC.String(),
+ "--device", testNetDev,
+ )
+ _, err = testbench.Exec(dockerutil.RunOpts{}, testArgs...)
+ if !*expectFailure && err != nil {
+ t.Fatal("test failed:", err)
+ }
+ if *expectFailure && err == nil {
+ t.Fatal("test failure expected but the test succeeded, enable the test and mark the corresponding bug as fixed")
+ }
+}
+
+func addNetworks(d *dockerutil.Docker, addr net.IP, networks []*dockerutil.DockerNetwork) error {
+ for _, dn := range networks {
+ ip := addressInSubnet(addr, *dn.Subnet)
+ // Connect to the network with the specified IP address.
+ if err := dn.Connect(d, "--ip", ip.String()); err != nil {
+ return fmt.Errorf("unable to connect container %s to network %s: %w", d.Name, dn.Name, err)
+ }
+ }
+ return nil
+}
+
+// addressInSubnet combines the subnet provided with the address and returns a
+// new address. The return address bits come from the subnet where the mask is 1
+// and from the ip address where the mask is 0.
+func addressInSubnet(addr net.IP, subnet net.IPNet) net.IP {
+ var octets []byte
+ for i := 0; i < 4; i++ {
+ octets = append(octets, (subnet.IP.To4()[i]&subnet.Mask[i])+(addr.To4()[i]&(^subnet.Mask[i])))
+ }
+ return net.IP(octets)
+}
+
+// makeDockerNetwork makes a randomly-named network that will start with the
+// namePrefix. The network will be a random /24 subnet.
+func createDockerNetwork(n *dockerutil.DockerNetwork) error {
+ randSource := rand.NewSource(time.Now().UnixNano())
+ r1 := rand.New(randSource)
+ // Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24.
+ ip := net.IPv4(byte(r1.Intn(224-192)+192), byte(r1.Intn(256)), byte(r1.Intn(256)), 0)
+ n.Subnet = &net.IPNet{
+ IP: ip,
+ Mask: ip.DefaultMask(),
+ }
+ return n.Create()
+}
+
+// deviceByIP finds a deviceInfo and device name from an IP address.
+func deviceByIP(d *dockerutil.Docker, ip net.IP) (string, netdevs.DeviceInfo, error) {
+ out, err := d.Exec(dockerutil.RunOpts{}, "ip", "addr", "show")
+ if err != nil {
+ return "", netdevs.DeviceInfo{}, fmt.Errorf("listing devices on %s container: %w", d.Name, err)
+ }
+ devs, err := netdevs.ParseDevices(out)
+ if err != nil {
+ return "", netdevs.DeviceInfo{}, fmt.Errorf("parsing devices from %s container: %w", d.Name, err)
+ }
+ testDevice, deviceInfo, err := netdevs.FindDeviceByIP(ip, devs)
+ if err != nil {
+ return "", netdevs.DeviceInfo{}, fmt.Errorf("can't find deviceInfo for container %s: %w", d.Name, err)
+ }
+ return testDevice, deviceInfo, nil
+}
diff --git a/test/packetimpact/testbench/BUILD b/test/packetimpact/testbench/BUILD
index 682933067..d19ec07d4 100644
--- a/test/packetimpact/testbench/BUILD
+++ b/test/packetimpact/testbench/BUILD
@@ -21,6 +21,7 @@ go_library(
"//pkg/tcpip/header",
"//pkg/tcpip/seqnum",
"//pkg/usermem",
+ "//test/packetimpact/netdevs",
"//test/packetimpact/proto:posix_server_go_proto",
"@com_github_google_go-cmp//cmp:go_default_library",
"@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
diff --git a/test/packetimpact/testbench/connections.go b/test/packetimpact/testbench/connections.go
index 463fd0556..bf104e5ca 100644
--- a/test/packetimpact/testbench/connections.go
+++ b/test/packetimpact/testbench/connections.go
@@ -114,12 +114,12 @@ var _ layerState = (*etherState)(nil)
func newEtherState(out, in Ether) (*etherState, error) {
lMAC, err := tcpip.ParseMACAddress(LocalMAC)
if err != nil {
- return nil, err
+ return nil, fmt.Errorf("parsing local MAC: %q: %w", LocalMAC, err)
}
rMAC, err := tcpip.ParseMACAddress(RemoteMAC)
if err != nil {
- return nil, err
+ return nil, fmt.Errorf("parsing remote MAC: %q: %w", RemoteMAC, err)
}
s := etherState{
out: Ether{SrcAddr: &lMAC, DstAddr: &rMAC},
diff --git a/test/packetimpact/testbench/dut.go b/test/packetimpact/testbench/dut.go
index a78b7d7ee..f3e326adf 100644
--- a/test/packetimpact/testbench/dut.go
+++ b/test/packetimpact/testbench/dut.go
@@ -16,6 +16,7 @@ package testbench
import (
"context"
+ "flag"
"net"
"strconv"
"syscall"
@@ -37,6 +38,11 @@ type DUT struct {
// NewDUT creates a new connection with the DUT over gRPC.
func NewDUT(t *testing.T) DUT {
+ flag.Parse()
+ if err := genPseudoFlags(); err != nil {
+ t.Fatal("generating psuedo flags:", err)
+ }
+
posixServerAddress := POSIXServerIP + ":" + strconv.Itoa(POSIXServerPort)
conn, err := grpc.Dial(posixServerAddress, grpc.WithInsecure(), grpc.WithKeepaliveParams(keepalive.ClientParameters{Timeout: RPCKeepalive}))
if err != nil {
@@ -235,7 +241,9 @@ func (dut *DUT) Connect(fd int32, sa unix.Sockaddr) {
ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout)
defer cancel()
ret, err := dut.ConnectWithErrno(ctx, fd, sa)
- if ret != 0 {
+ // Ignore 'operation in progress' error that can be returned when the socket
+ // is non-blocking.
+ if err != syscall.Errno(unix.EINPROGRESS) && ret != 0 {
dut.t.Fatalf("failed to connect socket: %s", err)
}
}
diff --git a/test/packetimpact/testbench/rawsockets.go b/test/packetimpact/testbench/rawsockets.go
index 4665f60b2..278229b7e 100644
--- a/test/packetimpact/testbench/rawsockets.go
+++ b/test/packetimpact/testbench/rawsockets.go
@@ -16,7 +16,6 @@ package testbench
import (
"encoding/binary"
- "flag"
"fmt"
"math"
"net"
@@ -41,7 +40,6 @@ func htons(x uint16) uint16 {
// NewSniffer creates a Sniffer connected to *device.
func NewSniffer(t *testing.T) (Sniffer, error) {
- flag.Parse()
snifferFd, err := unix.Socket(unix.AF_PACKET, unix.SOCK_RAW, int(htons(unix.ETH_P_ALL)))
if err != nil {
return Sniffer{}, err
@@ -136,7 +134,6 @@ type Injector struct {
// NewInjector creates a new injector on *device.
func NewInjector(t *testing.T) (Injector, error) {
- flag.Parse()
ifInfo, err := net.InterfaceByName(Device)
if err != nil {
return Injector{}, err
diff --git a/test/packetimpact/testbench/testbench.go b/test/packetimpact/testbench/testbench.go
index a1242b189..4de2aa1d3 100644
--- a/test/packetimpact/testbench/testbench.go
+++ b/test/packetimpact/testbench/testbench.go
@@ -16,7 +16,12 @@ package testbench
import (
"flag"
+ "fmt"
+ "net"
+ "os/exec"
"time"
+
+ "gvisor.dev/gvisor/test/packetimpact/netdevs"
)
var (
@@ -55,9 +60,31 @@ func RegisterFlags(fs *flag.FlagSet) {
fs.DurationVar(&RPCKeepalive, "rpc_keepalive", RPCKeepalive, "gRPC keepalive")
fs.StringVar(&LocalIPv4, "local_ipv4", LocalIPv4, "local IPv4 address for test packets")
fs.StringVar(&RemoteIPv4, "remote_ipv4", RemoteIPv4, "remote IPv4 address for test packets")
- fs.StringVar(&LocalIPv6, "local_ipv6", LocalIPv6, "local IPv6 address for test packets")
fs.StringVar(&RemoteIPv6, "remote_ipv6", RemoteIPv6, "remote IPv6 address for test packets")
- fs.StringVar(&LocalMAC, "local_mac", LocalMAC, "local mac address for test packets")
fs.StringVar(&RemoteMAC, "remote_mac", RemoteMAC, "remote mac address for test packets")
fs.StringVar(&Device, "device", Device, "local device for test packets")
}
+
+// genPseudoFlags populates flag-like global config based on real flags.
+//
+// genPseudoFlags must only be called after flag.Parse.
+func genPseudoFlags() error {
+ out, err := exec.Command("ip", "addr", "show").CombinedOutput()
+ if err != nil {
+ return fmt.Errorf("listing devices: %q: %w", string(out), err)
+ }
+ devs, err := netdevs.ParseDevices(string(out))
+ if err != nil {
+ return fmt.Errorf("parsing devices: %w", err)
+ }
+
+ _, deviceInfo, err := netdevs.FindDeviceByIP(net.ParseIP(LocalIPv4), devs)
+ if err != nil {
+ return fmt.Errorf("can't find deviceInfo: %w", err)
+ }
+
+ LocalMAC = deviceInfo.MAC.String()
+ LocalIPv6 = deviceInfo.IPv6Addr.String()
+
+ return nil
+}
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index c4ffda17e..9cd695200 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -1,4 +1,4 @@
-load("defs.bzl", "packetimpact_go_test")
+load("//test/packetimpact/runner:defs.bzl", "packetimpact_go_test")
package(
default_visibility = ["//test/packetimpact:__subpackages__"],
@@ -16,6 +16,19 @@ packetimpact_go_test(
)
packetimpact_go_test(
+ name = "ipv4_id_uniqueness",
+ srcs = ["ipv4_id_uniqueness_test.go"],
+ # TODO(b/157506701) Fix netstack then remove the line below.
+ expect_netstack_failure = True,
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/tcpip/header",
+ "//test/packetimpact/testbench",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
+)
+
+packetimpact_go_test(
name = "udp_recv_multicast",
srcs = ["udp_recv_multicast_test.go"],
# TODO(b/152813495): Fix netstack then remove the line below.
@@ -157,6 +170,26 @@ packetimpact_go_test(
)
packetimpact_go_test(
+ name = "tcp_synsent_reset",
+ srcs = ["tcp_synsent_reset_test.go"],
+ deps = [
+ "//pkg/tcpip/header",
+ "//test/packetimpact/testbench",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
+)
+
+packetimpact_go_test(
+ name = "tcp_synrcvd_reset",
+ srcs = ["tcp_synrcvd_reset_test.go"],
+ deps = [
+ "//pkg/tcpip/header",
+ "//test/packetimpact/testbench",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
+)
+
+packetimpact_go_test(
name = "icmpv6_param_problem",
srcs = ["icmpv6_param_problem_test.go"],
# TODO(b/153485026): Fix netstack then remove the line below.
@@ -177,8 +210,3 @@ packetimpact_go_test(
"@org_golang_x_sys//unix:go_default_library",
],
)
-
-sh_binary(
- name = "test_runner",
- srcs = ["test_runner.sh"],
-)
diff --git a/test/packetimpact/tests/ipv4_id_uniqueness_test.go b/test/packetimpact/tests/ipv4_id_uniqueness_test.go
new file mode 100644
index 000000000..49e481d0b
--- /dev/null
+++ b/test/packetimpact/tests/ipv4_id_uniqueness_test.go
@@ -0,0 +1,111 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ipv4_id_uniqueness_test
+
+import (
+ "context"
+ "flag"
+ "fmt"
+ "testing"
+ "time"
+
+ "golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+ tb.RegisterFlags(flag.CommandLine)
+}
+
+func recvTCPSegment(conn *tb.TCPIPv4, expect *tb.TCP, expectPayload *tb.Payload) (uint16, error) {
+ layers, err := conn.ExpectData(expect, expectPayload, time.Second)
+ if err != nil {
+ return 0, fmt.Errorf("failed to receive TCP segment: %s", err)
+ }
+ if len(layers) < 2 {
+ return 0, fmt.Errorf("got packet with layers: %v, expected to have at least 2 layers (link and network)", layers)
+ }
+ ipv4, ok := layers[1].(*tb.IPv4)
+ if !ok {
+ return 0, fmt.Errorf("got network layer: %T, expected: *IPv4", layers[1])
+ }
+ if *ipv4.Flags&header.IPv4FlagDontFragment != 0 {
+ return 0, fmt.Errorf("got IPv4 DF=1, expected DF=0")
+ }
+ return *ipv4.ID, nil
+}
+
+// RFC 6864 section 4.2 states: "The IPv4 ID of non-atomic datagrams MUST NOT
+// be reused when sending a copy of an earlier non-atomic datagram."
+//
+// This test creates a TCP connection, uses the IP_MTU_DISCOVER socket option
+// to force the DF bit to be 0, and checks that a retransmitted segment has a
+// different IPv4 Identification value than the original segment.
+func TestIPv4RetransmitIdentificationUniqueness(t *testing.T) {
+ dut := tb.NewDUT(t)
+ defer dut.TearDown()
+
+ listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+ defer dut.Close(listenFD)
+
+ conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort})
+ defer conn.Close()
+
+ conn.Handshake()
+ remoteFD, _ := dut.Accept(listenFD)
+ defer dut.Close(remoteFD)
+
+ dut.SetSockOptInt(remoteFD, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1)
+
+ // TODO(b/129291778) The following socket option clears the DF bit on
+ // IP packets sent over the socket, and is currently not supported by
+ // gVisor. gVisor by default sends packets with DF=0 anyway, so the
+ // socket option being not supported does not affect the operation of
+ // this test. Once the socket option is supported, the following call
+ // can be changed to simply assert success.
+ ret, errno := dut.SetSockOptIntWithErrno(context.Background(), remoteFD, unix.IPPROTO_IP, linux.IP_MTU_DISCOVER, linux.IP_PMTUDISC_DONT)
+ if ret == -1 && errno != unix.ENOTSUP {
+ t.Fatalf("failed to set IP_MTU_DISCOVER socket option to IP_PMTUDISC_DONT: %s", errno)
+ }
+
+ sampleData := []byte("Sample Data")
+ samplePayload := &tb.Payload{Bytes: sampleData}
+
+ dut.Send(remoteFD, sampleData, 0)
+ if _, err := conn.ExpectData(&tb.TCP{}, samplePayload, time.Second); err != nil {
+ t.Fatalf("failed to receive TCP segment sent for RTT calculation: %s", err)
+ }
+ // Let the DUT estimate RTO with RTT from the DATA-ACK.
+ // TODO(gvisor.dev/issue/2685) Estimate RTO during handshake, after which
+ // we can skip sending this ACK.
+ conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
+
+ expectTCP := &tb.TCP{SeqNum: tb.Uint32(uint32(*conn.RemoteSeqNum()))}
+ dut.Send(remoteFD, sampleData, 0)
+ originalID, err := recvTCPSegment(&conn, expectTCP, samplePayload)
+ if err != nil {
+ t.Fatalf("failed to receive TCP segment: %s", err)
+ }
+
+ retransmitID, err := recvTCPSegment(&conn, expectTCP, samplePayload)
+ if err != nil {
+ t.Fatalf("failed to receive retransmitted TCP segment: %s", err)
+ }
+ if originalID == retransmitID {
+ t.Fatalf("unexpectedly got retransmitted TCP segment with same IPv4 ID field=%d", originalID)
+ }
+}
diff --git a/test/packetimpact/tests/tcp_synrcvd_reset_test.go b/test/packetimpact/tests/tcp_synrcvd_reset_test.go
new file mode 100644
index 000000000..e6ba84cab
--- /dev/null
+++ b/test/packetimpact/tests/tcp_synrcvd_reset_test.go
@@ -0,0 +1,52 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_syn_reset_test
+
+import (
+ "flag"
+ "testing"
+ "time"
+
+ "golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+ tb.RegisterFlags(flag.CommandLine)
+}
+
+// TestTCPSynRcvdReset tests transition from SYN-RCVD to CLOSED.
+func TestTCPSynRcvdReset(t *testing.T) {
+ dut := tb.NewDUT(t)
+ defer dut.TearDown()
+ listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+ defer dut.Close(listenFD)
+ conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort})
+ defer conn.Close()
+
+ // Expect dut connection to have transitioned to SYN-RCVD state.
+ conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagSyn)})
+ if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err != nil {
+ t.Fatalf("expected SYN-ACK %s", err)
+ }
+ conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)})
+ // Expect the connection to have transitioned SYN-RCVD to CLOSED.
+ // TODO(gvisor.dev/issue/478): Check for TCP_INFO on the dut side.
+ conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
+ if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil {
+ t.Fatalf("expected a TCP RST %s", err)
+ }
+}
diff --git a/test/packetimpact/tests/tcp_synsent_reset_test.go b/test/packetimpact/tests/tcp_synsent_reset_test.go
new file mode 100644
index 000000000..6898a2239
--- /dev/null
+++ b/test/packetimpact/tests/tcp_synsent_reset_test.go
@@ -0,0 +1,88 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_synsent_reset_test
+
+import (
+ "flag"
+ "net"
+ "testing"
+ "time"
+
+ "golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+ tb.RegisterFlags(flag.CommandLine)
+}
+
+// dutSynSentState sets up the dut connection in SYN-SENT state.
+func dutSynSentState(t *testing.T) (*tb.DUT, *tb.TCPIPv4, uint16, uint16) {
+ dut := tb.NewDUT(t)
+
+ clientFD, clientPort := dut.CreateBoundSocket(unix.SOCK_STREAM|unix.SOCK_NONBLOCK, unix.IPPROTO_TCP, net.ParseIP(tb.RemoteIPv4))
+ port := uint16(9001)
+ conn := tb.NewTCPIPv4(t, tb.TCP{SrcPort: &port, DstPort: &clientPort}, tb.TCP{SrcPort: &clientPort, DstPort: &port})
+
+ sa := unix.SockaddrInet4{Port: int(port)}
+ copy(sa.Addr[:], net.IP(net.ParseIP(tb.LocalIPv4)).To4())
+ // Bring the dut to SYN-SENT state with a non-blocking connect.
+ dut.Connect(clientFD, &sa)
+ if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagSyn)}, nil, time.Second); err != nil {
+ t.Fatalf("expected SYN\n")
+ }
+
+ return &dut, &conn, port, clientPort
+}
+
+// TestTCPSynSentReset tests RFC793, p67: SYN-SENT to CLOSED transition.
+func TestTCPSynSentReset(t *testing.T) {
+ dut, conn, _, _ := dutSynSentState(t)
+ defer conn.Close()
+ defer dut.TearDown()
+ conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst | header.TCPFlagAck)})
+ // Expect the connection to have closed.
+ // TODO(gvisor.dev/issue/478): Check for TCP_INFO on the dut side.
+ conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
+ if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil {
+ t.Fatalf("expected a TCP RST")
+ }
+}
+
+// TestTCPSynSentRcvdReset tests RFC793, p70, SYN-SENT to SYN-RCVD to CLOSED
+// transitions.
+func TestTCPSynSentRcvdReset(t *testing.T) {
+ dut, c, remotePort, clientPort := dutSynSentState(t)
+ defer dut.TearDown()
+ defer c.Close()
+
+ conn := tb.NewTCPIPv4(t, tb.TCP{SrcPort: &remotePort, DstPort: &clientPort}, tb.TCP{SrcPort: &clientPort, DstPort: &remotePort})
+ defer conn.Close()
+ // Initiate new SYN connection with the same port pair
+ // (simultaneous open case), expect the dut connection to move to
+ // SYN-RCVD state
+ conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagSyn)})
+ if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err != nil {
+ t.Fatalf("expected SYN-ACK %s\n", err)
+ }
+ conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)})
+ // Expect the connection to have transitioned SYN-RCVD to CLOSED.
+ // TODO(gvisor.dev/issue/478): Check for TCP_INFO on the dut side.
+ conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
+ if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil {
+ t.Fatalf("expected a TCP RST")
+ }
+}
diff --git a/test/packetimpact/tests/test_runner.sh b/test/packetimpact/tests/test_runner.sh
deleted file mode 100755
index 706441cce..000000000
--- a/test/packetimpact/tests/test_runner.sh
+++ /dev/null
@@ -1,325 +0,0 @@
-#!/bin/bash
-
-# Copyright 2020 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Run a packetimpact test. Two docker containers are made, one for the
-# Device-Under-Test (DUT) and one for the test bench. Each is attached with
-# two networks, one for control packets that aid the test and one for test
-# packets which are sent as part of the test and observed for correctness.
-
-set -euxo pipefail
-
-function failure() {
- local lineno=$1
- local msg=$2
- local filename="$0"
- echo "FAIL: $filename:$lineno: $msg"
-}
-trap 'failure ${LINENO} "$BASH_COMMAND"' ERR
-
-declare -r LONGOPTS="dut_platform:,posix_server_binary:,testbench_binary:,runtime:,tshark,extra_test_arg:,expect_failure"
-
-# Don't use declare below so that the error from getopt will end the script.
-PARSED=$(getopt --options "" --longoptions=$LONGOPTS --name "$0" -- "$@")
-
-eval set -- "$PARSED"
-
-declare -a EXTRA_TEST_ARGS
-
-while true; do
- case "$1" in
- --dut_platform)
- # Either "linux" or "netstack".
- declare -r DUT_PLATFORM="$2"
- shift 2
- ;;
- --posix_server_binary)
- declare -r POSIX_SERVER_BINARY="$2"
- shift 2
- ;;
- --testbench_binary)
- declare -r TESTBENCH_BINARY="$2"
- shift 2
- ;;
- --runtime)
- # Not readonly because there might be multiple --runtime arguments and we
- # want to use just the last one. Only used if --dut_platform is
- # "netstack".
- declare RUNTIME="$2"
- shift 2
- ;;
- --tshark)
- declare -r TSHARK="1"
- shift 1
- ;;
- --extra_test_arg)
- EXTRA_TEST_ARGS+="$2"
- shift 2
- ;;
- --expect_failure)
- declare -r EXPECT_FAILURE="1"
- shift 1
- ;;
- --)
- shift
- break
- ;;
- *)
- echo "Programming error"
- exit 3
- esac
-done
-
-# All the other arguments are scripts.
-declare -r scripts="$@"
-
-# Check that the required flags are defined in a way that is safe for "set -u".
-if [[ "${DUT_PLATFORM-}" == "netstack" ]]; then
- if [[ -z "${RUNTIME-}" ]]; then
- echo "FAIL: Missing --runtime argument: ${RUNTIME-}"
- exit 2
- fi
- declare -r RUNTIME_ARG="--runtime ${RUNTIME}"
-elif [[ "${DUT_PLATFORM-}" == "linux" ]]; then
- declare -r RUNTIME_ARG=""
-else
- echo "FAIL: Bad or missing --dut_platform argument: ${DUT_PLATFORM-}"
- exit 2
-fi
-if [[ ! -f "${POSIX_SERVER_BINARY-}" ]]; then
- echo "FAIL: Bad or missing --posix_server_binary: ${POSIX_SERVER-}"
- exit 2
-fi
-if [[ ! -f "${TESTBENCH_BINARY-}" ]]; then
- echo "FAIL: Bad or missing --testbench_binary: ${TESTBENCH_BINARY-}"
- exit 2
-fi
-
-function new_net_prefix() {
- # Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24.
- echo "$(shuf -i 192-223 -n 1).$(shuf -i 0-255 -n 1).$(shuf -i 0-255 -n 1)"
-}
-
-# Variables specific to the control network and interface start with CTRL_.
-# Variables specific to the test network and interface start with TEST_.
-# Variables specific to the DUT start with DUT_.
-# Variables specific to the test bench start with TESTBENCH_.
-# Use random numbers so that test networks don't collide.
-declare CTRL_NET="ctrl_net-${RANDOM}${RANDOM}"
-declare CTRL_NET_PREFIX=$(new_net_prefix)
-declare TEST_NET="test_net-${RANDOM}${RANDOM}"
-declare TEST_NET_PREFIX=$(new_net_prefix)
-# On both DUT and test bench, testing packets are on the eth2 interface.
-declare -r TEST_DEVICE="eth2"
-# Number of bits in the *_NET_PREFIX variables.
-declare -r NET_MASK="24"
-# Last bits of the DUT's IP address.
-declare -r DUT_NET_SUFFIX=".10"
-# Control port.
-declare -r CTRL_PORT="40000"
-# Last bits of the test bench's IP address.
-declare -r TESTBENCH_NET_SUFFIX=".20"
-declare -r TIMEOUT="60"
-declare -r IMAGE_TAG="gcr.io/gvisor-presubmit/packetimpact"
-
-# Make sure that docker is installed.
-docker --version
-
-function finish {
- local cleanup_success=1
-
- if [[ -z "${TSHARK-}" ]]; then
- # Kill tcpdump so that it will flush output.
- docker exec -t "${TESTBENCH}" \
- killall tcpdump || \
- cleanup_success=0
- else
- # Kill tshark so that it will flush output.
- docker exec -t "${TESTBENCH}" \
- killall tshark || \
- cleanup_success=0
- fi
-
- for net in "${CTRL_NET}" "${TEST_NET}"; do
- # Kill all processes attached to ${net}.
- for docker_command in "kill" "rm"; do
- (docker network inspect "${net}" \
- --format '{{range $key, $value := .Containers}}{{$key}} {{end}}' \
- | xargs -r docker "${docker_command}") || \
- cleanup_success=0
- done
- # Remove the network.
- docker network rm "${net}" || \
- cleanup_success=0
- done
-
- if ((!$cleanup_success)); then
- echo "FAIL: Cleanup command failed"
- exit 4
- fi
-}
-trap finish EXIT
-
-# Subnet for control packets between test bench and DUT.
-while ! docker network create \
- "--subnet=${CTRL_NET_PREFIX}.0/${NET_MASK}" "${CTRL_NET}"; do
- sleep 0.1
- CTRL_NET_PREFIX=$(new_net_prefix)
- CTRL_NET="ctrl_net-${RANDOM}${RANDOM}"
-done
-
-# Subnet for the packets that are part of the test.
-while ! docker network create \
- "--subnet=${TEST_NET_PREFIX}.0/${NET_MASK}" "${TEST_NET}"; do
- sleep 0.1
- TEST_NET_PREFIX=$(new_net_prefix)
- TEST_NET="test_net-${RANDOM}${RANDOM}"
-done
-
-docker pull "${IMAGE_TAG}"
-
-# Create the DUT container and connect to network.
-DUT=$(docker create ${RUNTIME_ARG} --privileged --rm \
- --cap-add NET_ADMIN \
- --sysctl net.ipv6.conf.all.disable_ipv6=0 \
- --stop-timeout ${TIMEOUT} -it ${IMAGE_TAG})
-docker network connect "${CTRL_NET}" \
- --ip "${CTRL_NET_PREFIX}${DUT_NET_SUFFIX}" "${DUT}" \
- || (docker kill ${DUT}; docker rm ${DUT}; false)
-docker network connect "${TEST_NET}" \
- --ip "${TEST_NET_PREFIX}${DUT_NET_SUFFIX}" "${DUT}" \
- || (docker kill ${DUT}; docker rm ${DUT}; false)
-docker start "${DUT}"
-
-# Create the test bench container and connect to network.
-TESTBENCH=$(docker create --privileged --rm \
- --cap-add NET_ADMIN \
- --sysctl net.ipv6.conf.all.disable_ipv6=0 \
- --stop-timeout ${TIMEOUT} -it ${IMAGE_TAG})
-docker network connect "${CTRL_NET}" \
- --ip "${CTRL_NET_PREFIX}${TESTBENCH_NET_SUFFIX}" "${TESTBENCH}" \
- || (docker kill ${TESTBENCH}; docker rm ${TESTBENCH}; false)
-docker network connect "${TEST_NET}" \
- --ip "${TEST_NET_PREFIX}${TESTBENCH_NET_SUFFIX}" "${TESTBENCH}" \
- || (docker kill ${TESTBENCH}; docker rm ${TESTBENCH}; false)
-docker start "${TESTBENCH}"
-
-# Start the posix_server in the DUT.
-declare -r DOCKER_POSIX_SERVER_BINARY="/$(basename ${POSIX_SERVER_BINARY})"
-docker cp -L ${POSIX_SERVER_BINARY} "${DUT}:${DOCKER_POSIX_SERVER_BINARY}"
-
-docker exec -t "${DUT}" \
- /bin/bash -c "${DOCKER_POSIX_SERVER_BINARY} \
- --ip ${CTRL_NET_PREFIX}${DUT_NET_SUFFIX} \
- --port ${CTRL_PORT}" &
-
-# Because the Linux kernel receives the SYN-ACK but didn't send the SYN it will
-# issue a RST. To prevent this IPtables can be used to filter those out.
-docker exec "${TESTBENCH}" \
- iptables -A INPUT -i ${TEST_DEVICE} -j DROP
-
-# Wait for the DUT server to come up. Attempt to connect to it from the test
-# bench every 100 milliseconds until success.
-while ! docker exec "${TESTBENCH}" \
- nc -zv "${CTRL_NET_PREFIX}${DUT_NET_SUFFIX}" "${CTRL_PORT}"; do
- sleep 0.1
-done
-
-declare -r REMOTE_MAC=$(docker exec -t "${DUT}" ip link show \
- "${TEST_DEVICE}" | tail -1 | cut -d' ' -f6)
-declare -r LOCAL_MAC=$(docker exec -t "${TESTBENCH}" ip link show \
- "${TEST_DEVICE}" | tail -1 | cut -d' ' -f6)
-declare REMOTE_IPV6=$(docker exec -t "${DUT}" ip addr show scope link \
- "${TEST_DEVICE}" | grep inet6 | cut -d' ' -f6 | cut -d'/' -f1)
-declare -r LOCAL_IPV6=$(docker exec -t "${TESTBENCH}" ip addr show scope link \
- "${TEST_DEVICE}" | grep inet6 | cut -d' ' -f6 | cut -d'/' -f1)
-
-# Netstack as DUT doesn't assign IPv6 addresses automatically so do it if
-# needed. Convert the MAC address to an IPv6 link local address as described in
-# RFC 4291 page 20: https://tools.ietf.org/html/rfc4291#page-20
-if [[ -z "${REMOTE_IPV6}" ]]; then
- # Split the octets of the MAC into an array of strings.
- IFS=":" read -a REMOTE_OCTETS <<< "${REMOTE_MAC}"
- # Flip the global bit.
- REMOTE_OCTETS[0]=$(printf '%x' "$((0x${REMOTE_OCTETS[0]} ^ 2))")
- # Add the IPv6 address.
- docker exec "${DUT}" \
- ip addr add $(printf 'fe80::%02x%02x:%02xff:fe%02x:%02x%02x/64' \
- "0x${REMOTE_OCTETS[0]}" "0x${REMOTE_OCTETS[1]}" "0x${REMOTE_OCTETS[2]}" \
- "0x${REMOTE_OCTETS[3]}" "0x${REMOTE_OCTETS[4]}" "0x${REMOTE_OCTETS[5]}") \
- scope link \
- dev "${TEST_DEVICE}"
- # Re-extract the IPv6 address.
- # TODO(eyalsoha): Add "scope link" below when netstack supports correctly
- # creating link-local IPv6 addresses.
- REMOTE_IPV6=$(docker exec -t "${DUT}" ip addr show \
- "${TEST_DEVICE}" | grep inet6 | cut -d' ' -f6 | cut -d'/' -f1)
-fi
-
-declare -r DOCKER_TESTBENCH_BINARY="/$(basename ${TESTBENCH_BINARY})"
-docker cp -L "${TESTBENCH_BINARY}" "${TESTBENCH}:${DOCKER_TESTBENCH_BINARY}"
-
-if [[ -z "${TSHARK-}" ]]; then
- # Run tcpdump in the test bench unbuffered, without dns resolution, just on
- # the interface with the test packets.
- docker exec -t "${TESTBENCH}" \
- tcpdump -S -vvv -U -n -i "${TEST_DEVICE}" \
- net "${TEST_NET_PREFIX}/24" or \
- host "${REMOTE_IPV6}" or \
- host "${LOCAL_IPV6}" &
-else
- # Run tshark in the test bench unbuffered, without dns resolution, just on the
- # interface with the test packets.
- docker exec -t "${TESTBENCH}" \
- tshark -V -l -n -i "${TEST_DEVICE}" \
- -o tcp.check_checksum:TRUE \
- -o udp.check_checksum:TRUE \
- net "${TEST_NET_PREFIX}/24" or \
- host "${REMOTE_IPV6}" or \
- host "${LOCAL_IPV6}" &
-fi
-
-# tcpdump and tshark take time to startup
-sleep 3
-
-# Start a packetimpact test on the test bench. The packetimpact test sends and
-# receives packets and also sends POSIX socket commands to the posix_server to
-# be executed on the DUT.
-docker exec \
- -e XML_OUTPUT_FILE="/test.xml" \
- -e TEST_TARGET \
- -t "${TESTBENCH}" \
- /bin/bash -c "${DOCKER_TESTBENCH_BINARY} \
- ${EXTRA_TEST_ARGS[@]-} \
- --posix_server_ip=${CTRL_NET_PREFIX}${DUT_NET_SUFFIX} \
- --posix_server_port=${CTRL_PORT} \
- --remote_ipv4=${TEST_NET_PREFIX}${DUT_NET_SUFFIX} \
- --local_ipv4=${TEST_NET_PREFIX}${TESTBENCH_NET_SUFFIX} \
- --remote_ipv6=${REMOTE_IPV6} \
- --local_ipv6=${LOCAL_IPV6} \
- --remote_mac=${REMOTE_MAC} \
- --local_mac=${LOCAL_MAC} \
- --device=${TEST_DEVICE}" && true
-declare -r TEST_RESULT="${?}"
-if [[ -z "${EXPECT_FAILURE-}" && "${TEST_RESULT}" != 0 ]]; then
- echo 'FAIL: This test was expected to pass.'
- exit ${TEST_RESULT}
-fi
-if [[ ! -z "${EXPECT_FAILURE-}" && "${TEST_RESULT}" == 0 ]]; then
- echo 'FAIL: This test was expected to fail but passed. Enable the test and' \
- 'mark the corresponding bug as fixed.'
- exit 1
-fi
-echo PASS: No errors.
diff --git a/test/root/BUILD b/test/root/BUILD
index 639e293e3..a9e91ccd6 100644
--- a/test/root/BUILD
+++ b/test/root/BUILD
@@ -33,6 +33,7 @@ go_test(
],
visibility = ["//:sandbox"],
deps = [
+ "//pkg/cleanup",
"//pkg/test/criutil",
"//pkg/test/dockerutil",
"//pkg/test/testutil",
diff --git a/test/root/crictl_test.go b/test/root/crictl_test.go
index 85007dcce..c138e02dc 100644
--- a/test/root/crictl_test.go
+++ b/test/root/crictl_test.go
@@ -30,10 +30,10 @@ import (
"testing"
"time"
+ "gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/test/criutil"
"gvisor.dev/gvisor/pkg/test/dockerutil"
"gvisor.dev/gvisor/pkg/test/testutil"
- "gvisor.dev/gvisor/runsc/specutils"
)
// Tests for crictl have to be run as root (rather than in a user namespace)
@@ -272,27 +272,20 @@ disabled_plugins = ["restart"]
// * Runs containerd and waits for it to reach a "ready" state for testing.
// * Returns a cleanup function that should be called at the end of the test.
func setup(t *testing.T) (*criutil.Crictl, func(), error) {
- var cleanups []func()
- cleanupFunc := func() {
- for i := len(cleanups) - 1; i >= 0; i-- {
- cleanups[i]()
- }
- }
- cleanup := specutils.MakeCleanup(cleanupFunc)
- defer cleanup.Clean()
-
// Create temporary containerd root and state directories, and a socket
// via which crictl and containerd communicate.
containerdRoot, err := ioutil.TempDir(testutil.TmpDir(), "containerd-root")
if err != nil {
t.Fatalf("failed to create containerd root: %v", err)
}
- cleanups = append(cleanups, func() { os.RemoveAll(containerdRoot) })
+ cu := cleanup.Make(func() { os.RemoveAll(containerdRoot) })
+ defer cu.Clean()
+
containerdState, err := ioutil.TempDir(testutil.TmpDir(), "containerd-state")
if err != nil {
t.Fatalf("failed to create containerd state: %v", err)
}
- cleanups = append(cleanups, func() { os.RemoveAll(containerdState) })
+ cu.Add(func() { os.RemoveAll(containerdState) })
sockAddr := filepath.Join(testutil.TmpDir(), "containerd-test.sock")
// We rewrite a configuration. This is based on the current docker
@@ -305,7 +298,7 @@ func setup(t *testing.T) (*criutil.Crictl, func(), error) {
if err != nil {
t.Fatalf("failed to write containerd config")
}
- cleanups = append(cleanups, configCleanup)
+ cu.Add(configCleanup)
// Start containerd.
cmd := exec.Command(getContainerd(),
@@ -321,7 +314,8 @@ func setup(t *testing.T) (*criutil.Crictl, func(), error) {
stdout := &bytes.Buffer{}
cmd.Stderr = io.MultiWriter(startupW, stderr)
cmd.Stdout = io.MultiWriter(startupW, stdout)
- cleanups = append(cleanups, func() {
+ cu.Add(func() {
+ // Log output in case of failure.
t.Logf("containerd stdout: %s", stdout.String())
t.Logf("containerd stderr: %s", stderr.String())
})
@@ -338,15 +332,14 @@ func setup(t *testing.T) (*criutil.Crictl, func(), error) {
// Kill must be the last cleanup (as it will be executed first).
cc := criutil.NewCrictl(t, sockAddr)
- cleanups = append(cleanups, func() {
+ cu.Add(func() {
cc.CleanUp() // Remove tmp files, etc.
if err := testutil.KillCommand(cmd); err != nil {
log.Printf("error killing containerd: %v", err)
}
})
- cleanup.Release()
- return cc, cleanupFunc, nil
+ return cc, cu.Release(), nil
}
// httpGet GETs the contents of a file served from a pod on port 80.
diff --git a/test/root/oom_score_adj_test.go b/test/root/oom_score_adj_test.go
index 9a3cecd97..4243eb59e 100644
--- a/test/root/oom_score_adj_test.go
+++ b/test/root/oom_score_adj_test.go
@@ -20,6 +20,7 @@ import (
"testing"
specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/test/testutil"
"gvisor.dev/gvisor/runsc/container"
"gvisor.dev/gvisor/runsc/specutils"
@@ -324,40 +325,26 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
}
func startContainers(t *testing.T, specs []*specs.Spec, ids []string) ([]*container.Container, func(), error) {
- var (
- containers []*container.Container
- cleanups []func()
- )
- cleanups = append(cleanups, func() {
- for _, c := range containers {
- c.Destroy()
- }
- })
- cleanupAll := func() {
- for _, c := range cleanups {
- c()
- }
- }
- localClean := specutils.MakeCleanup(cleanupAll)
- defer localClean.Clean()
+ var containers []*container.Container
// All containers must share the same root.
- rootDir, cleanup, err := testutil.SetupRootDir()
+ rootDir, clean, err := testutil.SetupRootDir()
if err != nil {
t.Fatalf("error creating root dir: %v", err)
}
- cleanups = append(cleanups, cleanup)
+ cu := cleanup.Make(clean)
+ defer cu.Clean()
// Point this to from the configuration.
conf := testutil.TestConfig(t)
conf.RootDir = rootDir
for i, spec := range specs {
- bundleDir, cleanup, err := testutil.SetupBundleDir(spec)
+ bundleDir, clean, err := testutil.SetupBundleDir(spec)
if err != nil {
return nil, nil, fmt.Errorf("error setting up bundle: %v", err)
}
- cleanups = append(cleanups, cleanup)
+ cu.Add(clean)
args := container.Args{
ID: ids[i],
@@ -375,6 +362,5 @@ func startContainers(t *testing.T, specs []*specs.Spec, ids []string) ([]*contai
}
}
- localClean.Release()
- return containers, cleanupAll, nil
+ return containers, cu.Release(), nil
}
diff --git a/test/runner/runner.go b/test/runner/runner.go
index e4f04cd2a..e048e5a9c 100644
--- a/test/runner/runner.go
+++ b/test/runner/runner.go
@@ -204,7 +204,7 @@ func runRunsc(tc gtest.TestCase, spec *specs.Spec) error {
return
}
log.Warningf("%s: Got signal: %v", name, s)
- done := make(chan bool)
+ done := make(chan bool, 1)
dArgs := append([]string{}, args...)
dArgs = append(dArgs, "-alsologtostderr=true", "debug", "--stacks", id)
go func(dArgs []string) {
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 5acdb8438..f4b5de18d 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -951,6 +951,7 @@ cc_binary(
"//test/util:epoll_util",
"//test/util:file_descriptor",
"//test/util:fs_util",
+ "//test/util:posix_error",
"//test/util:temp_path",
"//test/util:test_main",
"//test/util:test_util",
@@ -1382,7 +1383,7 @@ cc_binary(
srcs = ["partial_bad_buffer.cc"],
linkstatic = 1,
deps = [
- "//test/syscalls/linux:socket_test_util",
+ ":socket_test_util",
"//test/util:file_descriptor",
"//test/util:fs_util",
"@com_google_absl//absl/time",
@@ -3461,7 +3462,7 @@ cc_binary(
deps = [
":socket_test_util",
gtest,
- "//test/syscalls/linux:socket_netlink_route_util",
+ ":socket_netlink_route_util",
"//test/util:capability_util",
"//test/util:file_descriptor",
"//test/util:fs_util",
diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc
index 0e13ad190..2306d9cab 100644
--- a/test/syscalls/linux/inotify.cc
+++ b/test/syscalls/linux/inotify.cc
@@ -33,6 +33,7 @@
#include "test/util/epoll_util.h"
#include "test/util/file_descriptor.h"
#include "test/util/fs_util.h"
+#include "test/util/posix_error.h"
#include "test/util/temp_path.h"
#include "test/util/test_util.h"
#include "test/util/thread_util.h"
@@ -335,6 +336,11 @@ TEST(Inotify, InotifyFdNotWritable) {
EXPECT_THAT(write(fd.get(), "x", 1), SyscallFailsWithErrno(EBADF));
}
+TEST(Inotify, InitFlags) {
+ EXPECT_THAT(inotify_init1(IN_NONBLOCK | IN_CLOEXEC), SyscallSucceeds());
+ EXPECT_THAT(inotify_init1(12345), SyscallFailsWithErrno(EINVAL));
+}
+
TEST(Inotify, NonBlockingReadReturnsEagain) {
const FileDescriptor fd =
ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
@@ -395,7 +401,7 @@ TEST(Inotify, CanDeleteFileAfterRemovingWatch) {
file1.reset();
}
-TEST(Inotify, CanRemoveWatchAfterDeletingFile) {
+TEST(Inotify, RemoveWatchAfterDeletingFileFails) {
const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
TempPath file1 =
ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
@@ -491,17 +497,23 @@ TEST(Inotify, DeletingChildGeneratesEvents) {
Event(IN_DELETE, root_wd, Basename(file1_path))}));
}
+// Creating a file in "parent/child" should generate events for child, but not
+// parent.
TEST(Inotify, CreatingFileGeneratesEvents) {
- const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath child =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(parent.path()));
const FileDescriptor fd =
ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), parent.path(), IN_ALL_EVENTS));
const int wd = ASSERT_NO_ERRNO_AND_VALUE(
- InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+ InotifyAddWatch(fd.get(), child.path(), IN_ALL_EVENTS));
// Create a new file in the directory.
const TempPath file1 =
- ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(child.path()));
const std::vector<Event> events =
ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
@@ -554,6 +566,47 @@ TEST(Inotify, WritingFileGeneratesModifyEvent) {
ASSERT_THAT(events, Are({Event(IN_MODIFY, wd, Basename(file1.path()))}));
}
+TEST(Inotify, SizeZeroReadWriteGeneratesNothing) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+
+ const FileDescriptor file1_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDWR));
+ ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+
+ // Read from the empty file.
+ int val;
+ ASSERT_THAT(read(file1_fd.get(), &val, sizeof(val)),
+ SyscallSucceedsWithValue(0));
+
+ // Write zero bytes.
+ ASSERT_THAT(write(file1_fd.get(), "", 0), SyscallSucceedsWithValue(0));
+
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events, Are({}));
+}
+
+TEST(Inotify, FailedFileCreationGeneratesNoEvents) {
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), dir.path(), IN_ALL_EVENTS));
+
+ const char* p = dir.path().c_str();
+ ASSERT_THAT(mkdir(p, 0777), SyscallFails());
+ ASSERT_THAT(mknod(p, S_IFIFO, 0777), SyscallFails());
+ ASSERT_THAT(symlink(p, p), SyscallFails());
+ ASSERT_THAT(link(p, p), SyscallFails());
+ std::vector<Event> events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events, Are({}));
+}
+
TEST(Inotify, WatchSetAfterOpenReportsCloseFdEvent) {
const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
const FileDescriptor fd =
@@ -602,7 +655,7 @@ TEST(Inotify, ChildrenDeletionInWatchedDirGeneratesEvent) {
Event(IN_DELETE | IN_ISDIR, wd, Basename(dir1_path))}));
}
-TEST(Inotify, WatchTargetDeletionGeneratesEvent) {
+TEST(Inotify, RmdirOnWatchedTargetGeneratesEvent) {
const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
const FileDescriptor fd =
ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
@@ -1228,7 +1281,7 @@ TEST(Inotify, LinkGeneratesAttribAndCreateEvents) {
InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
const int rc = link(file1.path().c_str(), link1.path().c_str());
- // link(2) is only supported on tmpfs in the sandbox.
+ // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox.
SKIP_IF(IsRunningOnGvisor() && rc != 0 &&
(errno == EPERM || errno == ENOENT));
ASSERT_THAT(rc, SyscallSucceeds());
@@ -1322,21 +1375,27 @@ TEST(Inotify, HardlinksReuseSameWatch) {
Event(IN_DELETE, root_wd, Basename(file1_path))}));
}
+// Calling mkdir within "parent/child" should generate an event for child, but
+// not parent.
TEST(Inotify, MkdirGeneratesCreateEventWithDirFlag) {
- const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath child =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(parent.path()));
const FileDescriptor fd =
ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
- const int root_wd = ASSERT_NO_ERRNO_AND_VALUE(
- InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+ ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), parent.path(), IN_ALL_EVENTS));
+ const int child_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), child.path(), IN_ALL_EVENTS));
- const TempPath dir1(NewTempAbsPathInDir(root.path()));
+ const TempPath dir1(NewTempAbsPathInDir(child.path()));
ASSERT_THAT(mkdir(dir1.path().c_str(), 0777), SyscallSucceeds());
const std::vector<Event> events =
ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
ASSERT_THAT(
events,
- Are({Event(IN_CREATE | IN_ISDIR, root_wd, Basename(dir1.path()))}));
+ Are({Event(IN_CREATE | IN_ISDIR, child_wd, Basename(dir1.path()))}));
}
TEST(Inotify, MultipleInotifyInstancesAndWatchesAllGetEvents) {
@@ -1597,6 +1656,8 @@ TEST(Inotify, EpollNoDeadlock) {
}
TEST(Inotify, SpliceEvent) {
+ // TODO(gvisor.dev/issue/138): Implement splice in VFS2.
+ SKIP_IF(IsRunningOnGvisor() && !IsRunningWithVFS1());
int pipes[2];
ASSERT_THAT(pipe2(pipes, O_NONBLOCK), SyscallSucceeds());
@@ -1624,6 +1685,263 @@ TEST(Inotify, SpliceEvent) {
ASSERT_THAT(events, Are({Event(IN_ACCESS, watcher)}));
}
+// Watches on a parent should not be triggered by actions on a hard link to one
+// of its children that has a different parent.
+TEST(Inotify, LinkOnOtherParent) {
+ const TempPath dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath file =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir1.path()));
+ std::string link_path = NewTempAbsPathInDir(dir2.path());
+
+ const int rc = link(file.path().c_str(), link_path.c_str());
+ // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox.
+ SKIP_IF(IsRunningOnGvisor() && rc != 0 &&
+ (errno == EPERM || errno == ENOENT));
+ ASSERT_THAT(rc, SyscallSucceeds());
+
+ const FileDescriptor inotify_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(inotify_fd.get(), dir1.path(), IN_ALL_EVENTS));
+
+ // Perform various actions on the link outside of dir1, which should trigger
+ // no inotify events.
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(link_path.c_str(), O_RDWR));
+ int val = 0;
+ ASSERT_THAT(write(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+ ASSERT_THAT(read(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+ ASSERT_THAT(ftruncate(fd.get(), 12345), SyscallSucceeds());
+ ASSERT_THAT(unlink(link_path.c_str()), SyscallSucceeds());
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({}));
+}
+
+TEST(Inotify, Exec) {
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath bin = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo(dir.path(), "/bin/true"));
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), bin.path(), IN_ALL_EVENTS));
+
+ // Perform exec.
+ ScopedThread t([&bin]() {
+ ASSERT_THAT(execl(bin.path().c_str(), bin.path().c_str(), (char*)nullptr),
+ SyscallSucceeds());
+ });
+ t.Join();
+
+ std::vector<Event> events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ EXPECT_THAT(events, Are({Event(IN_OPEN, wd), Event(IN_ACCESS, wd)}));
+}
+
+// Watches without IN_EXCL_UNLINK, should continue to emit events for file
+// descriptors after their corresponding files have been unlinked.
+//
+// We need to disable S/R because there are filesystems where we cannot re-open
+// fds to an unlinked file across S/R, e.g. gofer-backed filesytems.
+TEST(Inotify, IncludeUnlinkedFile_NoRandomSave) {
+ const DisableSave ds;
+
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateFileWith(dir.path(), "123", TempPath::kDefaultFileMode));
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+ const FileDescriptor inotify_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const int dir_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(inotify_fd.get(), dir.path(), IN_ALL_EVENTS));
+ const int file_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(inotify_fd.get(), file.path(), IN_ALL_EVENTS));
+
+ ASSERT_THAT(unlink(file.path().c_str()), SyscallSucceeds());
+ int val = 0;
+ ASSERT_THAT(read(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+ ASSERT_THAT(write(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({
+ Event(IN_ATTRIB, file_wd),
+ Event(IN_DELETE, dir_wd, Basename(file.path())),
+ Event(IN_ACCESS, dir_wd, Basename(file.path())),
+ Event(IN_ACCESS, file_wd),
+ Event(IN_MODIFY, dir_wd, Basename(file.path())),
+ Event(IN_MODIFY, file_wd),
+ }));
+}
+
+// Watches created with IN_EXCL_UNLINK will stop emitting events on fds for
+// children that have already been unlinked.
+//
+// We need to disable S/R because there are filesystems where we cannot re-open
+// fds to an unlinked file across S/R, e.g. gofer-backed filesytems.
+TEST(Inotify, ExcludeUnlink_NoRandomSave) {
+ const DisableSave ds;
+ // TODO(gvisor.dev/issue/1624): This test fails on VFS1.
+ SKIP_IF(IsRunningWithVFS1());
+
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath file =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+ const FileDescriptor inotify_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(
+ inotify_fd.get(), dir.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK));
+
+ // Unlink the child, which should cause further operations on the open file
+ // descriptor to be ignored.
+ ASSERT_THAT(unlink(file.path().c_str()), SyscallSucceeds());
+ int val = 0;
+ ASSERT_THAT(write(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+ ASSERT_THAT(read(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({Event(IN_DELETE, wd, Basename(file.path()))}));
+}
+
+// We need to disable S/R because there are filesystems where we cannot re-open
+// fds to an unlinked file across S/R, e.g. gofer-backed filesytems.
+TEST(Inotify, ExcludeUnlinkDirectory_NoRandomSave) {
+ const DisableSave ds;
+
+ const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ TempPath dir =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(parent.path()));
+ std::string dirPath = dir.path();
+ const FileDescriptor inotify_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(dirPath.c_str(), O_RDONLY | O_DIRECTORY));
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(
+ inotify_fd.get(), parent.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK));
+
+ // Unlink the dir, and then close the open fd.
+ ASSERT_THAT(rmdir(dirPath.c_str()), SyscallSucceeds());
+ dir.reset();
+
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ // No close event should appear.
+ ASSERT_THAT(events,
+ Are({Event(IN_DELETE | IN_ISDIR, wd, Basename(dirPath))}));
+}
+
+// If "dir/child" and "dir/child2" are links to the same file, and "dir/child"
+// is unlinked, a watch on "dir" with IN_EXCL_UNLINK will exclude future events
+// for fds on "dir/child" but not "dir/child2".
+//
+// We need to disable S/R because there are filesystems where we cannot re-open
+// fds to an unlinked file across S/R, e.g. gofer-backed filesytems.
+TEST(Inotify, ExcludeUnlinkMultipleChildren_NoRandomSave) {
+ const DisableSave ds;
+ // TODO(gvisor.dev/issue/1624): This test fails on VFS1.
+ SKIP_IF(IsRunningWithVFS1());
+
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath file =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+ std::string path1 = file.path();
+ std::string path2 = NewTempAbsPathInDir(dir.path());
+
+ const int rc = link(path1.c_str(), path2.c_str());
+ // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox.
+ SKIP_IF(IsRunningOnGvisor() && rc != 0 &&
+ (errno == EPERM || errno == ENOENT));
+ ASSERT_THAT(rc, SyscallSucceeds());
+ const FileDescriptor fd1 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(path1.c_str(), O_RDWR));
+ const FileDescriptor fd2 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(path2.c_str(), O_RDWR));
+
+ const FileDescriptor inotify_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(
+ inotify_fd.get(), dir.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK));
+
+ // After unlinking path1, only events on the fd for path2 should be generated.
+ ASSERT_THAT(unlink(path1.c_str()), SyscallSucceeds());
+ ASSERT_THAT(write(fd1.get(), "x", 1), SyscallSucceeds());
+ ASSERT_THAT(write(fd2.get(), "x", 1), SyscallSucceeds());
+
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({
+ Event(IN_DELETE, wd, Basename(path1)),
+ Event(IN_MODIFY, wd, Basename(path2)),
+ }));
+}
+
+// On native Linux, actions of data type FSNOTIFY_EVENT_INODE are not affected
+// by IN_EXCL_UNLINK (see
+// fs/notify/inotify/inotify_fsnotify.c:inotify_handle_event). Inode-level
+// events include changes to metadata and extended attributes.
+//
+// We need to disable S/R because there are filesystems where we cannot re-open
+// fds to an unlinked file across S/R, e.g. gofer-backed filesytems.
+TEST(Inotify, ExcludeUnlinkInodeEvents_NoRandomSave) {
+ const DisableSave ds;
+
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath file =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path().c_str(), O_RDWR));
+ const FileDescriptor inotify_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(
+ inotify_fd.get(), dir.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK));
+
+ // NOTE(b/157163751): Create another link before unlinking. This is needed for
+ // the gofer filesystem in gVisor, where open fds will not work once the link
+ // count hits zero. In VFS2, we end up skipping the gofer test anyway, because
+ // hard links are not supported for gofer fs.
+ if (IsRunningOnGvisor()) {
+ std::string link_path = NewTempAbsPath();
+ const int rc = link(file.path().c_str(), link_path.c_str());
+ // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox.
+ SKIP_IF(rc != 0 && (errno == EPERM || errno == ENOENT));
+ ASSERT_THAT(rc, SyscallSucceeds());
+ }
+
+ // Even after unlinking, inode-level operations will trigger events regardless
+ // of IN_EXCL_UNLINK.
+ ASSERT_THAT(unlink(file.path().c_str()), SyscallSucceeds());
+
+ // Perform various actions on fd.
+ ASSERT_THAT(ftruncate(fd.get(), 12345), SyscallSucceeds());
+ std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({
+ Event(IN_DELETE, wd, Basename(file.path())),
+ Event(IN_MODIFY, wd, Basename(file.path())),
+ }));
+
+ struct timeval times[2] = {{1, 0}, {2, 0}};
+ ASSERT_THAT(futimes(fd.get(), times), SyscallSucceeds());
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd, Basename(file.path()))}));
+
+ // S/R is disabled on this entire test due to behavior with unlink; it must
+ // also be disabled after this point because of fchmod.
+ ASSERT_THAT(fchmod(fd.get(), 0777), SyscallSucceeds());
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd, Basename(file.path()))}));
+}
+
} // namespace
} // namespace testing
} // namespace gvisor
diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc
index f103e2e56..08fc4b1b7 100644
--- a/test/syscalls/linux/splice.cc
+++ b/test/syscalls/linux/splice.cc
@@ -430,6 +430,55 @@ TEST(SpliceTest, TwoPipes) {
EXPECT_EQ(memcmp(rbuf.data(), buf.data(), kPageSize), 0);
}
+TEST(SpliceTest, TwoPipesCircular) {
+ // This test deadlocks the sentry on VFS1 because VFS1 splice ordering is
+ // based on fs.File.UniqueID, which does not prevent circular ordering between
+ // e.g. inode-level locks taken by fs.FileOperations.
+ SKIP_IF(IsRunningWithVFS1());
+
+ // Create two pipes.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ const FileDescriptor first_rfd(fds[0]);
+ const FileDescriptor first_wfd(fds[1]);
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ const FileDescriptor second_rfd(fds[0]);
+ const FileDescriptor second_wfd(fds[1]);
+
+ // On Linux, each pipe is normally limited to
+ // include/linux/pipe_fs_i.h:PIPE_DEF_BUFFERS buffers worth of data.
+ constexpr size_t PIPE_DEF_BUFFERS = 16;
+
+ // Write some data to each pipe. Below we splice 1 byte at a time between
+ // pipes, which very quickly causes each byte to be stored in a separate
+ // buffer, so we must ensure that the total amount of data in the system is <=
+ // PIPE_DEF_BUFFERS bytes.
+ std::vector<char> buf(PIPE_DEF_BUFFERS / 2);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(write(first_wfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+ ASSERT_THAT(write(second_wfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+
+ // Have another thread splice from the second pipe to the first, while we
+ // splice from the first to the second. The test passes if this does not
+ // deadlock.
+ const int kIterations = 1000;
+ DisableSave ds;
+ ScopedThread t([&]() {
+ for (int i = 0; i < kIterations; i++) {
+ ASSERT_THAT(
+ splice(second_rfd.get(), nullptr, first_wfd.get(), nullptr, 1, 0),
+ SyscallSucceedsWithValue(1));
+ }
+ });
+ for (int i = 0; i < kIterations; i++) {
+ ASSERT_THAT(
+ splice(first_rfd.get(), nullptr, second_wfd.get(), nullptr, 1, 0),
+ SyscallSucceedsWithValue(1));
+ }
+}
+
TEST(SpliceTest, Blocking) {
// Create two new pipes.
int first[2], second[2];
diff --git a/tools/bazel.mk b/tools/bazel.mk
index 7cb6e393b..b774c8fab 100644
--- a/tools/bazel.mk
+++ b/tools/bazel.mk
@@ -21,7 +21,8 @@ BRANCH_NAME := $(shell (git branch --show-current 2>/dev/null || \
# Bazel container configuration (see below).
USER ?= gvisor
-DOCKER_NAME ?= gvisor-bazel-$(shell readlink -m $(CURDIR) | md5sum | cut -c1-8)
+HASH ?= $(shell readlink -m $(CURDIR) | md5sum | cut -c1-8)
+DOCKER_NAME ?= gvisor-bazel-$(HASH)
DOCKER_PRIVILEGED ?= --privileged
BAZEL_CACHE := $(shell readlink -m ~/.cache/bazel/)
GCLOUD_CONFIG := $(shell readlink -m ~/.config/gcloud/)
@@ -40,6 +41,7 @@ FULL_DOCKER_RUN_OPTIONS += -v "$(DOCKER_SOCKET):$(DOCKER_SOCKET)"
DOCKER_GROUP := $(shell stat -c '%g' $(DOCKER_SOCKET))
ifneq ($(GID),$(DOCKER_GROUP))
USERADD_OPTIONS += --groups $(DOCKER_GROUP)
+GROUPADD_DOCKER += groupadd --gid $(DOCKER_GROUP) --non-unique docker-$(HASH) &&
FULL_DOCKER_RUN_OPTIONS += --group-add $(DOCKER_GROUP)
endif
endif
@@ -71,10 +73,12 @@ bazel-server-start: load-default ## Starts the bazel server.
$(FULL_DOCKER_RUN_OPTIONS) \
gvisor.dev/images/default \
sh -c "groupadd --gid $(GID) --non-unique $(USER) && \
+ $(GROUPADD_DOCKER) \
useradd --uid $(UID) --non-unique --no-create-home --gid $(GID) $(USERADD_OPTIONS) -d $(HOME) $(USER) && \
bazel version && \
exec tail --pid=\$$(bazel info server_pid) -f /dev/null"
- @while :; do if docker logs $(DOCKER_NAME) 2>/dev/null | grep "Build label:" >/dev/null; then break; fi; sleep 1; done
+ @while :; do if docker logs $(DOCKER_NAME) 2>/dev/null | grep "Build label:" >/dev/null; then break; fi; \
+ if ! docker ps | grep $(DOCKER_NAME); then exit 1; else sleep 1; fi; done
.PHONY: bazel-server-start
bazel-shutdown: ## Shuts down a running bazel server.
diff --git a/tools/go_branch.sh b/tools/go_branch.sh
index e568a0a76..093de89b4 100755
--- a/tools/go_branch.sh
+++ b/tools/go_branch.sh
@@ -88,6 +88,12 @@ EOF
# because they may correspond to unused templates, etc.
cp "${repo_orig}"/runsc/*.go runsc/
+# Normalize all permissions. The way bazel constructs the :gopath tree may leave
+# some strange permissions on files. We don't have anything in this tree that
+# should be execution, only the Go source files, README.md, and ${othersrc}.
+find . -type f -exec chmod 0644 {} \;
+find . -type d -exec chmod 0755 {} \;
+
# Update the current working set and commit.
git add . && git commit -m "Merge ${head} (automated)"
diff --git a/tools/make_repository.sh b/tools/make_apt.sh
index 32d7b3b1f..3fb1066e5 100755
--- a/tools/make_repository.sh
+++ b/tools/make_apt.sh
@@ -14,22 +14,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-# We need to be sure that only a repo path is printed on stdout.
-exec 50<&1
-exec 1<&2
-
-echo_stdout() {
- echo "$@" >&50
-}
-
-# Parse arguments. We require more than two arguments, which are the private
-# keyring, the e-mail associated with the signer, and the list of packages.
-if [ "$#" -le 3 ]; then
- echo "usage: $0 <private-key> <signer-email> <root> <packages...>"
+if [[ "$#" -le 3 ]]; then
+ echo "usage: $0 <private-key> <suite> <root> <packages...>"
exit 1
fi
declare -r private_key=$(readlink -e "$1"); shift
-declare -r signer="$1"; shift
+declare -r suite="$1"; shift
declare -r root="$1"; shift
# Ensure that we have the correct packages installed.
@@ -52,16 +42,16 @@ function apt_install() {
esac
done
}
-dpkg-sig --help >/dev/null || apt_install dpkg-sig
-apt-ftparchive --help >/dev/null || apt_install apt-utils
-xz --help >/dev/null || apt_install xz-utils
+dpkg-sig --help >/dev/null 2>&1 || apt_install dpkg-sig
+apt-ftparchive --help >/dev/null 2>&1 || apt_install apt-utils
+xz --help >/dev/null 2>&1 || apt_install xz-utils
# Verbose from this point.
set -xeo pipefail
-# Create a temporary working directory. We don't remove this, as we ultimately
-# print this result and allow the caller to copy wherever they would like.
-declare -r tmpdir=$(mktemp -d /tmp/repoXXXXXX)
+# Create a directory for the release.
+declare -r release="${root}/dists/${suite}"
+mkdir -p "${release}"
# Create a temporary keyring, and ensure it is cleaned up.
declare -r keyring=$(mktemp /tmp/keyringXXXXXX.gpg)
@@ -69,12 +59,18 @@ cleanup() {
rm -f "${keyring}"
}
trap cleanup EXIT
-gpg --no-default-keyring --keyring "${keyring}" --import "${private_key}"
+
+# We attempt the import twice because the first one will fail if the public key
+# is not found. This isn't actually a failure for us, because we don't require
+# the public (this may be stored separately). The second import will succeed
+# because, in reality, the first import succeeded and it's a no-op.
+gpg --no-default-keyring --keyring "${keyring}" --import "${private_key}" || \
+ gpg --no-default-keyring --keyring "${keyring}" --import "${private_key}"
# Copy the packages into the root.
for pkg in "$@"; do
- name=$(basename "${pkg}" .deb)
- name=$(basename "${name}" .changes)
+ ext=${pkg##*.}
+ name=$(basename "${pkg}" ".${ext}")
arch=${name##*_}
if [[ "${name}" == "${arch}" ]]; then
continue # Not a regular package.
@@ -90,17 +86,22 @@ for pkg in "$@"; do
echo "Unknown file type: ${pkg}"
exit 1
fi
- version=${version// /} # Trim whitespace.
- mkdir -p "${root}"/pool/"${version}"/binary-"${arch}"
- cp -a "${pkg}" "${root}"/pool/"${version}"/binary-"${arch}"
-done
-# Ensure all permissions are correct.
-find "${root}"/pool -type f -exec chmod 0644 {} \;
+ # The package may already exist, in which case we leave it alone.
+ version=${version// /} # Trim whitespace.
+ destdir="${root}/pool/${version}/binary-${arch}"
+ target="${destdir}/${name}.${ext}"
+ if [[ -f "${target}" ]]; then
+ continue
+ fi
-# Sign all packages.
-for file in "${root}"/pool/*/binary-*/*.deb; do
- dpkg-sig -g "--no-default-keyring --keyring ${keyring}" --sign builder "${file}"
+ # Copy & sign the package.
+ mkdir -p "${destdir}"
+ cp -a "${pkg}" "${target}"
+ chmod 0644 "${target}"
+ if [[ "${ext}" == "deb" ]]; then
+ dpkg-sig -g "--no-default-keyring --keyring ${keyring}" --sign builder "${target}"
+ fi
done
# Build the package list.
@@ -109,7 +110,7 @@ for dir in "${root}"/pool/*/binary-*; do
name=$(basename "${dir}")
arch=${name##binary-}
arches+=("${arch}")
- repo_packages="${tmpdir}"/main/"${name}"
+ repo_packages="${release}"/main/"${name}"
mkdir -p "${repo_packages}"
(cd "${root}" && apt-ftparchive --arch "${arch}" packages pool > "${repo_packages}"/Packages)
(cd "${repo_packages}" && cat Packages | gzip > Packages.gz)
@@ -117,23 +118,22 @@ for dir in "${root}"/pool/*/binary-*; do
done
# Build the release list.
-cat > "${tmpdir}"/apt.conf <<EOF
+cat > "${release}"/apt.conf <<EOF
APT {
FTPArchive {
Release {
Architectures "${arches[@]}";
+ Suite "${suite}";
Components "main";
};
};
};
EOF
-(cd "${tmpdir}" && apt-ftparchive -c=apt.conf release . > Release)
-rm "${tmpdir}"/apt.conf
+(cd "${release}" && apt-ftparchive -c=apt.conf release . > Release)
+rm "${release}"/apt.conf
# Sign the release.
declare -r digest_opts=("--digest-algo" "SHA512" "--cert-digest-algo" "SHA512")
-(cd "${tmpdir}" && gpg --no-default-keyring --keyring "${keyring}" --clearsign "${digest_opts[@]}" -o InRelease Release)
-(cd "${tmpdir}" && gpg --no-default-keyring --keyring "${keyring}" -abs "${digest_opts[@]}" -o Release.gpg Release)
-
-# Show the results.
-echo_stdout "${tmpdir}"
+(cd "${release}" && rm -f Release.gpg InRelease)
+(cd "${release}" && gpg --no-default-keyring --keyring "${keyring}" --clearsign "${digest_opts[@]}" -o InRelease Release)
+(cd "${release}" && gpg --no-default-keyring --keyring "${keyring}" -abs "${digest_opts[@]}" -o Release.gpg Release)
diff --git a/tools/make_release.sh b/tools/make_release.sh
new file mode 100755
index 000000000..b1cdd47b0
--- /dev/null
+++ b/tools/make_release.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [[ "$#" -le 2 ]]; then
+ echo "usage: $0 <private-key> <root> <binaries & packages...>"
+ echo "The environment variable NIGHTLY may be set to control"
+ echo "whether the nightly packages are produced or not."
+ exit 1
+fi
+
+set -xeo pipefail
+declare -r private_key="$1"; shift
+declare -r root="$1"; shift
+declare -a binaries
+declare -a pkgs
+
+# Collect binaries & packages.
+for arg in "$@"; do
+ if [[ "${arg}" == *.deb ]] || [[ "${arg}" == *.changes ]]; then
+ pkgs+=("${arg}")
+ else
+ binaries+=("${arg}")
+ fi
+done
+
+# install_raw installs raw artifacts.
+install_raw() {
+ mkdir -p "${root}/$1"
+ for binary in "${binaries[@]}"; do
+ # Copy the raw file & generate a sha512sum.
+ name=$(basename "${binary}")
+ cp -f "${binary}" "${root}/$1"
+ sha512sum "${root}/$1/${name}" | \
+ awk "{print $$1 \" ${name}\"}" > "${root}/$1/${name}.sha512"
+ done
+}
+
+# install_apt installs an apt repository.
+install_apt() {
+ tools/make_apt.sh "${private_key}" "$1" "${root}" "${pkgs[@]}"
+}
+
+# If nightly, install only nightly artifacts.
+if [[ "${NIGHTLY:-false}" == "true" ]]; then
+ # The "latest" directory and current date.
+ stamp="$(date -Idate)"
+ install_raw "nightly/latest"
+ install_raw "nightly/${stamp}"
+ install_apt "nightly"
+else
+ # Is it a tagged release? Build that.
+ tags="$(git tag --points-at HEAD 2>/dev/null || true)"
+ if ! [[ -z "${tags}" ]]; then
+ # Note that a given commit can match any number of tags. We have to iterate
+ # through all possible tags and produce associated artifacts.
+ for tag in ${tags}; do
+ name=$(echo "${tag}" | cut -d'-' -f2)
+ base=$(echo "${name}" | cut -d'.' -f1)
+ install_raw "release/${name}"
+ install_raw "release/latest"
+ install_apt "release"
+ install_apt "${base}"
+ done
+ else
+ # Otherwise, assume it is a raw master commit.
+ install_raw "master/latest"
+ install_apt "master"
+ fi
+fi
diff --git a/website/_config.yml b/website/_config.yml
index 3241e458c..b08602970 100644
--- a/website/_config.yml
+++ b/website/_config.yml
@@ -12,6 +12,7 @@ plugins:
- jekyll-inline-svg
- jekyll-relative-links
- jekyll-feed
+ - jekyll-sitemap
site_url: https://gvisor.dev
feed:
path: blog/index.xml