diff options
294 files changed, 7481 insertions, 2305 deletions
@@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +# RBE requires a strong hash function, such as SHA256. +startup --host_jvm_args=-Dbazel.DigestFunction=SHA256 + # Build with C++17. build --cxxopt=-std=c++17 @@ -22,11 +25,17 @@ build --stamp --workspace_status_command tools/workspace_status.sh build:remote --remote_executor=grpcs://remotebuildexecution.googleapis.com build:remote --project_id=gvisor-rbe build:remote --remote_instance_name=projects/gvisor-rbe/instances/default_instance +build:remote3 --remote_executor=grpcs://remotebuildexecution.googleapis.com +build:remote3 --project_id=gvisor-rbe +build:remote3 --remote_instance_name=projects/gvisor-rbe/instances/default_instance + # Enable authentication. This will pick up application default credentials by # default. You can use --google_credentials=some_file.json to use a service # account credential instead. build:remote --google_default_credentials=true build:remote --auth_scope="https://www.googleapis.com/auth/cloud-source-tools" +build:remote3 --google_default_credentials=true +build:remote3 --auth_scope="https://www.googleapis.com/auth/cloud-source-tools" # Add a custom platform and toolchain that builds in a privileged docker # container, which is required by our syscall tests. @@ -37,8 +46,13 @@ build:remote --platforms=//tools/bazeldefs:rbe_ubuntu1604 build:remote --crosstool_top=@rbe_default//cc:toolchain build:remote --jobs=50 build:remote --remote_timeout=3600 -# RBE requires a strong hash function, such as SHA256. -startup --host_jvm_args=-Dbazel.DigestFunction=SHA256 +build:remote3 --host_platform=//tools/bazeldefs:rbe_ubuntu1604_bazel3 +build:remote3 --extra_toolchains=//tools/bazeldefs:cc-toolchain-clang-x86_64-default_bazel3 +build:remote3 --extra_execution_platforms=//tools/bazeldefs:rbe_ubuntu1604_bazel3 +build:remote3 --platforms=//tools/bazeldefs:rbe_ubuntu1604_bazel3 +build:remote3 --crosstool_top=@rbe_default//cc:toolchain +build:remote3 --jobs=50 +build:remote3 --remote_timeout=3600 # Set flags for uploading to BES in order to view results in the Bazel Build # Results UI. diff --git a/.travis.yml b/.travis.yml index 9d3141f38..1d955b05d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,8 +30,10 @@ services: - docker jobs: include: - - os: linux - arch: amd64 + # AMD64 builds are tested on kokoro, so don't run them in travis to save + # capacity for arm64 builds. + # - os: linux + # arch: amd64 - os: linux arch: arm64 script: @@ -14,6 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Helpful pretty-printer. +MAKEBANNER := \033[1;34mmake\033[0m +submake = echo -e '$(MAKEBANNER) $1' >&2; $(MAKE) $1 + # Described below. OPTIONS := STARTUP_OPTIONS := @@ -85,7 +89,7 @@ endif ## define images $(1)-%: ## Image tool: $(1) a given image (also may use 'all-images'). - @$(MAKE) -C images $$@ + @$(call submake,-C images $$@) endef rebuild-...: ## Rebuild the given image. Also may use 'rebuild-all-images'. $(eval $(call images,rebuild)) @@ -96,7 +100,7 @@ $(eval $(call images,push)) load-...: ## Load (pull or rebuild) the given image. Also may use 'load-all-images'. $(eval $(call images,load)) list-images: ## List all available images. - @$(MAKE) -C images $$@ + @$(call submake, -C images $$@) ## ## Canonical build and test targets. @@ -106,21 +110,116 @@ list-images: ## List all available images. ## new subsystem or workflow, consider adding a new target here. ## runsc: ## Builds the runsc binary. - @$(MAKE) build TARGETS="//runsc" + @$(call submake,build OPTIONS="-c opt" TARGETS="//runsc") .PHONY: runsc +debian: ## Builds the debian packages. + @$(call submake,build OPTIONS="-c opt" TARGETS="//runsc:runsc-debian") +.PHONY: debian + smoke-test: ## Runs a simple smoke test after build runsc. - @$(MAKE) run DOCKER_PRIVILEGED="" ARGS="--alsologtostderr --network none --debug --TESTONLY-unsafe-nonroot=true --rootless do true" + @$(call submake,run DOCKER_PRIVILEGED="" ARGS="--alsologtostderr --network none --debug --TESTONLY-unsafe-nonroot=true --rootless do true") .PHONY: smoke-tests -unit-tests: ## Runs all unit tests in pkg runsc and tools. - @$(MAKE) test OPTIONS="pkg/... runsc/... tools/..." -.PHONY: unit-tests +unit-tests: ## Local package unit tests in pkg/..., runsc/, tools/.., etc. + @$(call submake,test TARGETS="pkg/... runsc/... tools/... benchmarks/... benchmarks/runner:runner_test") -tests: ## Runs all local ptrace system call tests. - @$(MAKE) test OPTIONS="--test_tag_filters runsc_ptrace test/syscalls/..." +tests: ## Runs all unit tests and syscall tests. +tests: unit-tests + @$(call submake,test TARGETS="test/syscalls/...") .PHONY: tests + +integration-tests: ## Run all standard integration tests. +integration-tests: docker-tests overlay-tests hostnet-tests swgso-tests +integration-tests: do-tests kvm-tests root-tests containerd-tests +.PHONY: integration-tests + +network-tests: ## Run all networking integration tests. +network-tests: iptables-tests packetdrill-tests packetimpact-tests +.PHONY: network-tests + +# Standard integration targets. +INTEGRATION_TARGETS := //test/image:image_test //test/e2e:integration_test + +syscall-%-tests: + @$(call submake,test OPTIONS="--test_tag_filters runsc_$* test/syscalls/...") + +syscall-native-tests: + @$(call submake,test OPTIONS="--test_tag_filters native test/syscalls/...") +.PHONY: syscall-native-tests + +syscall-tests: ## Run all system call tests. +syscall-tests: syscall-ptrace-tests syscall-kvm-tests syscall-native-tests +.PHONY: syscall-tests + +%-runtime-tests: load-runtimes_% + @$(call submake,install-test-runtime) + @$(call submake,test-runtime TARGETS="//test/runtimes:$*") + +do-tests: runsc + @$(call submake,run TARGETS="//runsc" ARGS="--rootless do true") + @$(call submake,run TARGETS="//runsc" ARGS="--rootless -network=none do true") + @$(call submake,sudo TARGETS="//runsc" ARGS="do true") +.PHONY: do-tests + +simple-tests: unit-tests # Compatibility target. +.PHONY: simple-tests + +IMAGE_FILTER := HelloWorld\|Httpd\|Ruby\|Stdio +INTEGRATION_FILTER := Life\|Pause\|Connect\|JobControl\|Overlay\|Exec\|DirCreation/root + +docker-tests: load-basic-images + @$(call submake,install-test-runtime RUNTIME="vfs1") + @$(call submake,test-runtime RUNTIME="vfs1" TARGETS="$(INTEGRATION_TARGETS)") + @$(call submake,install-test-runtime RUNTIME="vfs2" ARGS="--vfs2") + @$(call submake,test-runtime RUNTIME="vfs2" OPTIONS="--test_filter=$(IMAGE_FILTER)\|$(INTEGRATION_FILTER)" TARGETS="$(INTEGRATION_TARGETS)") +.PHONY: docker-tests + +overlay-tests: load-basic-images + @$(call submake,install-test-runtime RUNTIME="overlay" ARGS="--overlay") + @$(call submake,test-runtime RUNTIME="overlay" TARGETS="$(INTEGRATION_TARGETS)") +.PHONY: overlay-tests + +swgso-tests: load-basic-images + @$(call submake,install-test-runtime RUNTIME="swgso" ARGS="--software-gso=true --gso=false") + @$(call submake,test-runtime RUNTIME="swgso" TARGETS="$(INTEGRATION_TARGETS)") +.PHONY: swgso-tests + +hostnet-tests: load-basic-images + @$(call submake,install-test-runtime RUNTIME="hostnet" ARGS="--network=host") + @$(call submake,test-runtime RUNTIME="hostnet" OPTIONS="--test_arg=-checkpoint=false" TARGETS="$(INTEGRATION_TARGETS)") +.PHONY: hostnet-tests + +kvm-tests: load-basic-images + @(lsmod | grep -E '^(kvm_intel|kvm_amd)') || sudo modprobe kvm + @if ! [[ -w /dev/kvm ]]; then sudo chmod a+rw /dev/kvm; fi + @$(call submake,test TARGETS="//pkg/sentry/platform/kvm:kvm_test") + @$(call submake,install-test-runtime RUNTIME="kvm" ARGS="--platform=kvm") + @$(call submake,test-runtime RUNTIME="kvm" TARGETS="$(INTEGRATION_TARGETS)") +.PHONY: kvm-tests + +iptables-tests: load-iptables + @$(call submake,test-runtime RUNTIME="runc" TARGETS="//test/iptables:iptables_test") + @$(call submake,install-test-runtime RUNTIME="iptables" ARGS="--net-raw") + @$(call submake,test-runtime RUNTIME="iptables" TARGETS="//test/iptables:iptables_test") +.PHONY: iptables-tests + +packetdrill-tests: load-packetdrill + @$(call submake,install-test-runtime RUNTIME="packetdrill") + @$(call submake,test-runtime RUNTIME="packetdrill" TARGETS="$(shell $(MAKE) query TARGETS='attr(tags, packetdrill, tests(//...))')") +.PHONY: packetdrill-tests + +packetimpact-tests: load-packetimpact + @$(call submake,install-test-runtime RUNTIME="packetimpact") + @$(call submake,test-runtime RUNTIME="packetimpact" TARGETS="$(shell $(MAKE) query TARGETS='attr(tags, packetimpact, tests(//...))')") +.PHONY: packetimpact-tests + +root-tests: load-basic-images + @$(call submake,install-test-runtime) + @$(call submake,sudo TARGETS="//test/root:root_test" ARGS="-test.v") +.PHONY: test-root + # Specific containerd version tests. containerd-test-%: load-basic_alpine load-basic_python load-basic_busybox load-basic_resolv load-basic_httpd install-test-runtime @CONTAINERD_VERSION=$* $(MAKE) sudo TARGETS="tools/installers:containerd" @@ -154,7 +253,7 @@ WEBSITE_PROJECT := gvisordev WEBSITE_REGION := us-central1 website-build: load-jekyll ## Build the site image locally. - @$(MAKE) run TARGETS="//website:website" + @$(call submake,run TARGETS="//website:website") .PHONY: website-build website-server: website-build ## Run a local server for development. @@ -205,8 +304,8 @@ $(RELEASE_KEY): release: $(RELEASE_KEY) ## Builds a release. @mkdir -p $(RELEASE_ROOT) @T=$$(mktemp -d /tmp/release.XXXXXX); \ - $(MAKE) copy TARGETS="runsc" DESTINATION=$$T && \ - $(MAKE) copy TARGETS="runsc:runsc-debian" DESTINATION=$$T && \ + $(call submake,copy TARGETS="runsc" DESTINATION=$$T) && \ + $(call submake,copy TARGETS="runsc:runsc-debian" DESTINATION=$$T) && \ NIGHTLY=$(RELEASE_NIGHTLY) tools/make_release.sh $(RELEASE_KEY) $(RELEASE_ROOT) $$T/*; \ rc=$$?; rm -rf $$T; exit $$rc .PHONY: release @@ -229,43 +328,47 @@ tag: ## Creates and pushes a release tag. ## ifeq (,$(BRANCH_NAME)) RUNTIME := runsc -RUNTIME_DIR := $(shell dirname $(shell mktemp -u))/runsc +RUNTIME_DIR := $(shell dirname $(shell mktemp -u))/$(RUNTIME) else RUNTIME := $(BRANCH_NAME) -RUNTIME_DIR := $(shell dirname $(shell mktemp -u))/$(BRANCH_NAME) +RUNTIME_DIR := $(shell dirname $(shell mktemp -u))/$(RUNTIME) endif RUNTIME_BIN := $(RUNTIME_DIR)/runsc RUNTIME_LOG_DIR := $(RUNTIME_DIR)/logs RUNTIME_LOGS := $(RUNTIME_LOG_DIR)/runsc.log.%TEST%.%TIMESTAMP%.%COMMAND% dev: ## Installs a set of local runtimes. Requires sudo. - @$(MAKE) refresh ARGS="--net-raw" - @$(MAKE) configure RUNTIME="$(RUNTIME)" ARGS="--net-raw" - @$(MAKE) configure RUNTIME="$(RUNTIME)-d" ARGS="--net-raw --debug --strace --log-packets" - @$(MAKE) configure RUNTIME="$(RUNTIME)-p" ARGS="--net-raw --profile" - @$(MAKE) configure RUNTIME="$(RUNTIME)-vfs2-d" ARGS="--net-raw --debug --strace --log-packets --vfs2" + @$(call submake,refresh ARGS="--net-raw") + @$(call submake,configure RUNTIME_NAME="$(RUNTIME)" ARGS="--net-raw") + @$(call submake,configure RUNTIME_NAME="$(RUNTIME)-d" ARGS="--net-raw --debug --strace --log-packets") + @$(call submake,configure RUNTIME_NAME="$(RUNTIME)-p" ARGS="--net-raw --profile") + @$(call submake,configure RUNTIME_NAME="$(RUNTIME)-vfs2-d" ARGS="--net-raw --debug --strace --log-packets --vfs2") @sudo systemctl restart docker .PHONY: dev -refresh: ## Refreshes the runtime binary (for development only). Must have called 'dev' or 'test-install' first. +refresh: ## Refreshes the runtime binary (for development only). Must have called 'dev' or 'install-test-runtime' first. @mkdir -p "$(RUNTIME_DIR)" - @$(MAKE) copy TARGETS=runsc DESTINATION="$(RUNTIME_BIN)" + @$(call submake,copy TARGETS=runsc DESTINATION="$(RUNTIME_BIN)") .PHONY: install install-test-runtime: ## Installs the runtime for testing. Requires sudo. - @$(MAKE) refresh ARGS="--net-raw --TESTONLY-test-name-env=RUNSC_TEST_NAME --debug --strace --log-packets $(ARGS)" - @$(MAKE) configure RUNTIME=runsc - @$(MAKE) configure + @$(call submake,refresh ARGS="--net-raw --TESTONLY-test-name-env=RUNSC_TEST_NAME --debug --strace --log-packets $(ARGS)") + @$(call submake,configure RUNTIME_NAME=runsc) + @$(call submake,configure RUNTIME_NAME="$(RUNTIME)") @sudo systemctl restart docker + @if [[ -f /etc/docker/daemon.json ]]; then \ + sudo chmod 0755 /etc/docker && \ + sudo chmod 0644 /etc/docker/daemon.json; \ + fi .PHONY: install-test-runtime -configure: ## Configures a single runtime. Requires sudo. Typically called from dev or test-install. - @sudo sudo "$(RUNTIME_BIN)" install --experimental=true --runtime="$(RUNTIME)" -- --debug-log "$(RUNTIME_LOGS)" $(ARGS) - @echo "Installed runtime \"$(RUNTIME)\" @ $(RUNTIME_BIN)" - @echo "Logs are in: $(RUNTIME_LOG_DIR)" +configure: ## Configures a single runtime. Requires sudo. Typically called from dev or install-test-runtime. + @sudo sudo "$(RUNTIME_BIN)" install --experimental=true --runtime="$(RUNTIME_NAME)" -- --debug-log "$(RUNTIME_LOGS)" $(ARGS) + @echo -e "$(INFO) Installed runtime \"$(RUNTIME)\" @ $(RUNTIME_BIN)" + @echo -e "$(INFO) Logs are in: $(RUNTIME_LOG_DIR)" @sudo rm -rf "$(RUNTIME_LOG_DIR)" && mkdir -p "$(RUNTIME_LOG_DIR)" .PHONY: configure test-runtime: ## A convenient wrapper around test that provides the runtime argument. Target must still be provided. - @$(MAKE) test OPTIONS="$(OPTIONS) --test_arg=--runtime=$(RUNTIME)" -.PHONY: runtime-test + @$(call submake,test OPTIONS="$(OPTIONS) --test_output=streamed --test_arg=--runtime=$(RUNTIME)") +.PHONY: test-runtime @@ -42,6 +42,28 @@ http_archive( ], ) +http_archive( + name = "io_bazel_rules_go_bazel3", # To replace the above. + patch_args = ["-p1"], + patches = [ + "//tools/nogo:io_bazel_rules_go-visibility.patch", + ], + sha256 = "87f0fb9747854cb76a0a82430adccb6269f7d394237104a4523b51061c469171", + urls = [ + "https://mirror.bazel.build/github.com/bazelbuild/rules_go/releases/download/v0.23.1/rules_go-v0.23.1.tar.gz", + "https://github.com/bazelbuild/rules_go/releases/download/v0.23.1/rules_go-v0.23.1.tar.gz", + ], +) + +http_archive( + name = "bazel_gazelle_bazel3", # To replace the above. + sha256 = "bfd86b3cbe855d6c16c6fce60d76bd51f5c8dbc9cfcaef7a2bb5c1aafd0710e8", + urls = [ + "https://mirror.bazel.build/github.com/bazelbuild/bazel-gazelle/releases/download/v0.21.0/bazel-gazelle-v0.21.0.tar.gz", + "https://github.com/bazelbuild/bazel-gazelle/releases/download/v0.21.0/bazel-gazelle-v0.21.0.tar.gz", + ], +) + load("@io_bazel_rules_go//go:deps.bzl", "go_register_toolchains", "go_rules_dependencies") go_rules_dependencies() @@ -123,6 +145,16 @@ http_archive( ], ) +http_archive( + name = "bazel_toolchains_bazel3", # To replace the above. + sha256 = "144290c4166bd67e76a54f96cd504ed86416ca3ca82030282760f0823c10be48", + strip_prefix = "bazel-toolchains-3.1.1", + urls = [ + "https://github.com/bazelbuild/bazel-toolchains/releases/download/3.1.1/bazel-toolchains-3.1.1.tar.gz", + "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/releases/download/3.1.1/bazel-toolchains-3.1.1.tar.gz", + ], +) + # Creates a default toolchain config for RBE. load("@bazel_toolchains//rules:rbe_repo.bzl", "rbe_autoconfig") diff --git a/images/Makefile b/images/Makefile index 1485607bd..9de359a28 100644 --- a/images/Makefile +++ b/images/Makefile @@ -34,8 +34,15 @@ list-all-images: @for image in $(ALL_IMAGES); do echo $${image}; done .PHONY: list-build-images +# Handy wrapper to allow load-all-images, push-all-images, etc. %-all-images: @$(MAKE) $(patsubst %,$*-%,$(ALL_IMAGES)) +load-all-images: + @$(MAKE) $(patsubst %,load-%,$(ALL_IMAGES)) + +# Handy wrapper to load specified "groups", e.g. load-basic-images, etc. +load-%-images: + @$(MAKE) $(patsubst %,load-%,$(subst /,_,$(subst ./,,$(shell find ./$* -name Dockerfile -exec dirname {} \;)))) # tag is a function that returns the tag name, given an image. # diff --git a/images/hostoverlaytest/Dockerfile b/images/basic/hostoverlaytest/Dockerfile index d83439e9c..d83439e9c 100644 --- a/images/hostoverlaytest/Dockerfile +++ b/images/basic/hostoverlaytest/Dockerfile diff --git a/images/hostoverlaytest/test.c b/images/basic/hostoverlaytest/test.c index 088f90746..088f90746 100644 --- a/images/hostoverlaytest/test.c +++ b/images/basic/hostoverlaytest/test.c diff --git a/images/hostoverlaytest/testfile.txt b/images/basic/hostoverlaytest/testfile.txt index e4188c841..e4188c841 100644 --- a/images/hostoverlaytest/testfile.txt +++ b/images/basic/hostoverlaytest/testfile.txt diff --git a/images/tmpfile/Dockerfile b/images/basic/tmpfile/Dockerfile index e3816c8cb..e3816c8cb 100644 --- a/images/tmpfile/Dockerfile +++ b/images/basic/tmpfile/Dockerfile diff --git a/images/benchmarks/ffmpeg/Dockerfile b/images/benchmarks/ffmpeg/Dockerfile new file mode 100644 index 000000000..7108df64f --- /dev/null +++ b/images/benchmarks/ffmpeg/Dockerfile @@ -0,0 +1,9 @@ +FROM ubuntu:18.04 + +RUN set -x \ + && apt-get update \ + && apt-get install -y \ + ffmpeg \ + && rm -rf /var/lib/apt/lists/* +WORKDIR /media +ADD https://samples.ffmpeg.org/MPEG-4/video.mp4 video.mp4 diff --git a/images/benchmarks/redis/Dockerfile b/images/benchmarks/redis/Dockerfile new file mode 100644 index 000000000..0f17249af --- /dev/null +++ b/images/benchmarks/redis/Dockerfile @@ -0,0 +1 @@ +FROM redis:5.0.4 diff --git a/images/default/Dockerfile b/images/default/Dockerfile index 397082b02..2b38e6c58 100644 --- a/images/default/Dockerfile +++ b/images/default/Dockerfile @@ -1,7 +1,7 @@ FROM fedora:31 # Install bazel. RUN dnf install -y dnf-plugins-core && dnf copr enable -y vbatts/bazel -RUN dnf install -y git gcc make golang gcc-c++ glibc-devel python3 which python3-pip python3-devel libffi-devel openssl-devel pkg-config glibc-static libstdc++-static patch +RUN dnf install -y git gcc make golang gcc-c++ glibc-devel python3 which python3-pip python3-devel libffi-devel openssl-devel pkg-config glibc-static libstdc++-static patch diffutils RUN pip install pycparser RUN dnf install -y bazel3 # Install gcloud. diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index 2b789c4ec..05ca5342f 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -29,6 +29,7 @@ go_library( "file_amd64.go", "file_arm64.go", "fs.go", + "fuse.go", "futex.go", "inotify.go", "ioctl.go", @@ -72,6 +73,9 @@ go_library( "//pkg/abi", "//pkg/binary", "//pkg/bits", + "//pkg/usermem", + "//tools/go_marshal/marshal", + "//tools/go_marshal/primitive", ], ) diff --git a/pkg/abi/linux/fuse.go b/pkg/abi/linux/fuse.go new file mode 100644 index 000000000..d3ebbccc4 --- /dev/null +++ b/pkg/abi/linux/fuse.go @@ -0,0 +1,143 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// +marshal +type FUSEOpcode uint32 + +// +marshal +type FUSEOpID uint64 + +// Opcodes for FUSE operations. Analogous to the opcodes in include/linux/fuse.h. +const ( + FUSE_LOOKUP FUSEOpcode = 1 + FUSE_FORGET = 2 /* no reply */ + FUSE_GETATTR = 3 + FUSE_SETATTR = 4 + FUSE_READLINK = 5 + FUSE_SYMLINK = 6 + _ + FUSE_MKNOD = 8 + FUSE_MKDIR = 9 + FUSE_UNLINK = 10 + FUSE_RMDIR = 11 + FUSE_RENAME = 12 + FUSE_LINK = 13 + FUSE_OPEN = 14 + FUSE_READ = 15 + FUSE_WRITE = 16 + FUSE_STATFS = 17 + FUSE_RELEASE = 18 + _ + FUSE_FSYNC = 20 + FUSE_SETXATTR = 21 + FUSE_GETXATTR = 22 + FUSE_LISTXATTR = 23 + FUSE_REMOVEXATTR = 24 + FUSE_FLUSH = 25 + FUSE_INIT = 26 + FUSE_OPENDIR = 27 + FUSE_READDIR = 28 + FUSE_RELEASEDIR = 29 + FUSE_FSYNCDIR = 30 + FUSE_GETLK = 31 + FUSE_SETLK = 32 + FUSE_SETLKW = 33 + FUSE_ACCESS = 34 + FUSE_CREATE = 35 + FUSE_INTERRUPT = 36 + FUSE_BMAP = 37 + FUSE_DESTROY = 38 + FUSE_IOCTL = 39 + FUSE_POLL = 40 + FUSE_NOTIFY_REPLY = 41 + FUSE_BATCH_FORGET = 42 +) + +const ( + // FUSE_MIN_READ_BUFFER is the minimum size the read can be for any FUSE filesystem. + // This is the minimum size Linux supports. See linux.fuse.h. + FUSE_MIN_READ_BUFFER uint32 = 8192 +) + +// FUSEHeaderIn is the header read by the daemon with each request. +// +// +marshal +type FUSEHeaderIn struct { + // Len specifies the total length of the data, including this header. + Len uint32 + + // Opcode specifies the kind of operation of the request. + Opcode FUSEOpcode + + // Unique specifies the unique identifier for this request. + Unique FUSEOpID + + // NodeID is the ID of the filesystem object being operated on. + NodeID uint64 + + // UID is the UID of the requesting process. + UID uint32 + + // GID is the GID of the requesting process. + GID uint32 + + // PID is the PID of the requesting process. + PID uint32 + + _ uint32 +} + +// FUSEHeaderOut is the header written by the daemon when it processes +// a request and wants to send a reply (almost all operations require a +// reply; if they do not, this will be explicitly documented). +// +// +marshal +type FUSEHeaderOut struct { + // Len specifies the total length of the data, including this header. + Len uint32 + + // Error specifies the error that occurred (0 if none). + Error int32 + + // Unique specifies the unique identifier of the corresponding request. + Unique FUSEOpID +} + +// FUSEWriteIn is the header written by a daemon when it makes a +// write request to the FUSE filesystem. +// +// +marshal +type FUSEWriteIn struct { + // Fh specifies the file handle that is being written to. + Fh uint64 + + // Offset is the offset of the write. + Offset uint64 + + // Size is the size of data being written. + Size uint32 + + // WriteFlags is the flags used during the write. + WriteFlags uint32 + + // LockOwner is the ID of the lock owner. + LockOwner uint64 + + // Flags is the flags for the request. + Flags uint32 + + _ uint32 +} diff --git a/pkg/abi/linux/futex.go b/pkg/abi/linux/futex.go index 08bfde3b5..8138088a6 100644 --- a/pkg/abi/linux/futex.go +++ b/pkg/abi/linux/futex.go @@ -60,3 +60,21 @@ const ( FUTEX_WAITERS = 0x80000000 FUTEX_OWNER_DIED = 0x40000000 ) + +// FUTEX_BITSET_MATCH_ANY has all bits set. +const FUTEX_BITSET_MATCH_ANY = 0xffffffff + +// ROBUST_LIST_LIMIT protects against a deliberately circular list. +const ROBUST_LIST_LIMIT = 2048 + +// RobustListHead corresponds to Linux's struct robust_list_head. +// +// +marshal +type RobustListHead struct { + List uint64 + FutexOffset uint64 + ListOpPending uint64 +} + +// SizeOfRobustListHead is the size of a RobustListHead struct. +var SizeOfRobustListHead = (*RobustListHead)(nil).SizeBytes() diff --git a/pkg/abi/linux/netdevice.go b/pkg/abi/linux/netdevice.go index 7866352b4..0faf015c7 100644 --- a/pkg/abi/linux/netdevice.go +++ b/pkg/abi/linux/netdevice.go @@ -22,6 +22,8 @@ const ( ) // IFReq is an interface request. +// +// +marshal type IFReq struct { // IFName is an encoded name, normally null-terminated. This should be // accessed via the Name and SetName functions. @@ -79,6 +81,8 @@ type IFMap struct { // IFConf is used to return a list of interfaces and their addresses. See // netdevice(7) and struct ifconf for more detail on its use. +// +// +marshal type IFConf struct { Len int32 _ [4]byte // Pad to sizeof(struct ifconf). diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go index 46d8b0b42..a91f9f018 100644 --- a/pkg/abi/linux/netfilter.go +++ b/pkg/abi/linux/netfilter.go @@ -14,6 +14,14 @@ package linux +import ( + "io" + + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/tools/go_marshal/marshal" + "gvisor.dev/gvisor/tools/go_marshal/primitive" +) + // This file contains structures required to support netfilter, specifically // the iptables tool. @@ -76,6 +84,8 @@ const ( // IPTEntry is an iptable rule. It corresponds to struct ipt_entry in // include/uapi/linux/netfilter_ipv4/ip_tables.h. +// +// +marshal type IPTEntry struct { // IP is used to filter packets based on the IP header. IP IPTIP @@ -112,21 +122,41 @@ type IPTEntry struct { // SizeOfIPTEntry is the size of an IPTEntry. const SizeOfIPTEntry = 112 -// KernelIPTEntry is identical to IPTEntry, but includes the Elems field. This -// struct marshaled via the binary package to write an IPTEntry to userspace. +// KernelIPTEntry is identical to IPTEntry, but includes the Elems field. +// KernelIPTEntry itself is not Marshallable but it implements some methods of +// marshal.Marshallable that help in other implementations of Marshallable. type KernelIPTEntry struct { - IPTEntry + Entry IPTEntry // Elems holds the data for all this rule's matches followed by the // target. It is variable length -- users have to iterate over any // matches and use TargetOffset and NextOffset to make sense of the // data. - Elems []byte + Elems primitive.ByteSlice +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (ke *KernelIPTEntry) SizeBytes() int { + return ke.Entry.SizeBytes() + ke.Elems.SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (ke *KernelIPTEntry) MarshalBytes(dst []byte) { + ke.Entry.MarshalBytes(dst) + ke.Elems.MarshalBytes(dst[ke.Entry.SizeBytes():]) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (ke *KernelIPTEntry) UnmarshalBytes(src []byte) { + ke.Entry.UnmarshalBytes(src) + ke.Elems.UnmarshalBytes(src[ke.Entry.SizeBytes():]) } // IPTIP contains information for matching a packet's IP header. // It corresponds to struct ipt_ip in // include/uapi/linux/netfilter_ipv4/ip_tables.h. +// +// +marshal type IPTIP struct { // Src is the source IP address. Src InetAddr @@ -189,6 +219,8 @@ const SizeOfIPTIP = 84 // XTCounters holds packet and byte counts for a rule. It corresponds to struct // xt_counters in include/uapi/linux/netfilter/x_tables.h. +// +// +marshal type XTCounters struct { // Pcnt is the packet count. Pcnt uint64 @@ -321,6 +353,8 @@ const SizeOfXTRedirectTarget = 56 // IPTGetinfo is the argument for the IPT_SO_GET_INFO sockopt. It corresponds // to struct ipt_getinfo in include/uapi/linux/netfilter_ipv4/ip_tables.h. +// +// +marshal type IPTGetinfo struct { Name TableName ValidHooks uint32 @@ -336,6 +370,8 @@ const SizeOfIPTGetinfo = 84 // IPTGetEntries is the argument for the IPT_SO_GET_ENTRIES sockopt. It // corresponds to struct ipt_get_entries in // include/uapi/linux/netfilter_ipv4/ip_tables.h. +// +// +marshal type IPTGetEntries struct { Name TableName Size uint32 @@ -350,13 +386,103 @@ type IPTGetEntries struct { const SizeOfIPTGetEntries = 40 // KernelIPTGetEntries is identical to IPTGetEntries, but includes the -// Entrytable field. This struct marshaled via the binary package to write an -// KernelIPTGetEntries to userspace. +// Entrytable field. This has been manually made marshal.Marshallable since it +// is dynamically sized. type KernelIPTGetEntries struct { IPTGetEntries Entrytable []KernelIPTEntry } +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (ke *KernelIPTGetEntries) SizeBytes() int { + res := ke.IPTGetEntries.SizeBytes() + for _, entry := range ke.Entrytable { + res += entry.SizeBytes() + } + return res +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (ke *KernelIPTGetEntries) MarshalBytes(dst []byte) { + ke.IPTGetEntries.MarshalBytes(dst) + marshalledUntil := ke.IPTGetEntries.SizeBytes() + for i := 0; i < len(ke.Entrytable); i++ { + ke.Entrytable[i].MarshalBytes(dst[marshalledUntil:]) + marshalledUntil += ke.Entrytable[i].SizeBytes() + } +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (ke *KernelIPTGetEntries) UnmarshalBytes(src []byte) { + ke.IPTGetEntries.UnmarshalBytes(src) + unmarshalledUntil := ke.IPTGetEntries.SizeBytes() + for i := 0; i < len(ke.Entrytable); i++ { + ke.Entrytable[i].UnmarshalBytes(src[unmarshalledUntil:]) + unmarshalledUntil += ke.Entrytable[i].SizeBytes() + } +} + +// Packed implements marshal.Marshallable.Packed. +func (ke *KernelIPTGetEntries) Packed() bool { + // KernelIPTGetEntries isn't packed because the ke.Entrytable contains an + // indirection to the actual data we want to marshal (the slice data + // pointer), and the memory for KernelIPTGetEntries contains the slice + // header which we don't want to marshal. + return false +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (ke *KernelIPTGetEntries) MarshalUnsafe(dst []byte) { + // Fall back to safe Marshal because the type in not packed. + ke.MarshalBytes(dst) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (ke *KernelIPTGetEntries) UnmarshalUnsafe(src []byte) { + // Fall back to safe Unmarshal because the type in not packed. + ke.UnmarshalBytes(src) +} + +// CopyIn implements marshal.Marshallable.CopyIn. +func (ke *KernelIPTGetEntries) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + buf := task.CopyScratchBuffer(ke.SizeBytes()) // escapes: okay. + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Unmarshal unconditionally. If we had a short copy-in, this results in a + // partially unmarshalled struct. + ke.UnmarshalBytes(buf) // escapes: fallback. + return length, err +} + +// CopyOut implements marshal.Marshallable.CopyOut. +func (ke *KernelIPTGetEntries) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + // Type KernelIPTGetEntries doesn't have a packed layout in memory, fall + // back to MarshalBytes. + return task.CopyOutBytes(addr, ke.marshalAll(task)) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +func (ke *KernelIPTGetEntries) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + // Type KernelIPTGetEntries doesn't have a packed layout in memory, fall + // back to MarshalBytes. + return task.CopyOutBytes(addr, ke.marshalAll(task)[:limit]) +} + +func (ke *KernelIPTGetEntries) marshalAll(task marshal.Task) []byte { + buf := task.CopyScratchBuffer(ke.SizeBytes()) + ke.MarshalBytes(buf) + return buf +} + +// WriteTo implements io.WriterTo.WriteTo. +func (ke *KernelIPTGetEntries) WriteTo(w io.Writer) (int64, error) { + buf := make([]byte, ke.SizeBytes()) + ke.MarshalBytes(buf) + length, err := w.Write(buf) + return int64(length), err +} + +var _ marshal.Marshallable = (*KernelIPTGetEntries)(nil) + // IPTReplace is the argument for the IPT_SO_SET_REPLACE sockopt. It // corresponds to struct ipt_replace in // include/uapi/linux/netfilter_ipv4/ip_tables.h. @@ -374,12 +500,6 @@ type IPTReplace struct { // Entries [0]IPTEntry } -// KernelIPTReplace is identical to IPTReplace, but includes the Entries field. -type KernelIPTReplace struct { - IPTReplace - Entries [0]IPTEntry -} - // SizeOfIPTReplace is the size of an IPTReplace. const SizeOfIPTReplace = 96 @@ -392,6 +512,8 @@ func (en ExtensionName) String() string { } // TableName holds the name of a netfilter table. +// +// +marshal type TableName [XT_TABLE_MAXNAMELEN]byte // String implements fmt.Stringer. diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go index 4a14ef691..c24a8216e 100644 --- a/pkg/abi/linux/socket.go +++ b/pkg/abi/linux/socket.go @@ -134,6 +134,15 @@ const ( SHUT_RDWR = 2 ) +// Packet types from <linux/if_packet.h> +const ( + PACKET_HOST = 0 // To us + PACKET_BROADCAST = 1 // To all + PACKET_MULTICAST = 2 // To group + PACKET_OTHERHOST = 3 // To someone else + PACKET_OUTGOING = 4 // Outgoing of any type +) + // Socket options from socket.h. const ( SO_DEBUG = 1 @@ -225,6 +234,8 @@ const ( const SockAddrMax = 128 // InetAddr is struct in_addr, from uapi/linux/in.h. +// +// +marshal type InetAddr [4]byte // SockAddrInet is struct sockaddr_in, from uapi/linux/in.h. @@ -294,6 +305,8 @@ func (s *SockAddrUnix) implementsSockAddr() {} func (s *SockAddrNetlink) implementsSockAddr() {} // Linger is struct linger, from include/linux/socket.h. +// +// +marshal type Linger struct { OnOff int32 Linger int32 @@ -308,6 +321,8 @@ const SizeOfLinger = 8 // the end of this struct or within existing unusued space, so its size grows // over time. The current iteration is based on linux v4.17. New versions are // always backwards compatible. +// +// +marshal type TCPInfo struct { State uint8 CaState uint8 @@ -405,6 +420,8 @@ var SizeOfControlMessageHeader = int(binary.Size(ControlMessageHeader{})) // A ControlMessageCredentials is an SCM_CREDENTIALS socket control message. // // ControlMessageCredentials represents struct ucred from linux/socket.h. +// +// +marshal type ControlMessageCredentials struct { PID int32 UID uint32 diff --git a/pkg/p9/messages.go b/pkg/p9/messages.go index 57b89ad7d..2cb59f934 100644 --- a/pkg/p9/messages.go +++ b/pkg/p9/messages.go @@ -2506,7 +2506,7 @@ type msgFactory struct { var msgRegistry registry type registry struct { - factories [math.MaxUint8]msgFactory + factories [math.MaxUint8 + 1]msgFactory // largestFixedSize is computed so that given some message size M, you can // compute the maximum payload size (e.g. for Twrite, Rread) with diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go index daba8b172..fd95eb2d2 100644 --- a/pkg/sentry/arch/arch_aarch64.go +++ b/pkg/sentry/arch/arch_aarch64.go @@ -28,7 +28,14 @@ import ( ) // Registers represents the CPU registers for this architecture. -type Registers = linux.PtraceRegs +// +// +stateify savable +type Registers struct { + linux.PtraceRegs + + // TPIDR_EL0 is the EL0 Read/Write Software Thread ID Register. + TPIDR_EL0 uint64 +} const ( // SyscallWidth is the width of insturctions. @@ -101,9 +108,6 @@ type State struct { // Our floating point state. aarch64FPState `state:"wait"` - // TLS pointer - TPValue uint64 - // FeatureSet is a pointer to the currently active feature set. FeatureSet *cpuid.FeatureSet @@ -157,7 +161,6 @@ func (s *State) Fork() State { return State{ Regs: s.Regs, aarch64FPState: s.aarch64FPState.fork(), - TPValue: s.TPValue, FeatureSet: s.FeatureSet, OrigR0: s.OrigR0, } @@ -241,18 +244,18 @@ func (s *State) ptraceGetRegs() Registers { return s.Regs } -var registersSize = (*Registers)(nil).SizeBytes() +var ptraceRegistersSize = (*linux.PtraceRegs)(nil).SizeBytes() // PtraceSetRegs implements Context.PtraceSetRegs. func (s *State) PtraceSetRegs(src io.Reader) (int, error) { var regs Registers - buf := make([]byte, registersSize) + buf := make([]byte, ptraceRegistersSize) if _, err := io.ReadFull(src, buf); err != nil { return 0, err } regs.UnmarshalUnsafe(buf) s.Regs = regs - return registersSize, nil + return ptraceRegistersSize, nil } // PtraceGetFPRegs implements Context.PtraceGetFPRegs. @@ -278,7 +281,7 @@ const ( func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) { switch regset { case _NT_PRSTATUS: - if maxlen < registersSize { + if maxlen < ptraceRegistersSize { return 0, syserror.EFAULT } return s.PtraceGetRegs(dst) @@ -291,7 +294,7 @@ func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) { switch regset { case _NT_PRSTATUS: - if maxlen < registersSize { + if maxlen < ptraceRegistersSize { return 0, syserror.EFAULT } return s.PtraceSetRegs(src) diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index 3b3a0a272..1c3e3c14c 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -300,7 +300,7 @@ func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) { // PTRACE_PEEKUSER and PTRACE_POKEUSER are only effective on regs and // u_debugreg, returning 0 or silently no-oping for other fields // respectively. - if addr < uintptr(registersSize) { + if addr < uintptr(ptraceRegistersSize) { regs := c.ptraceGetRegs() buf := make([]byte, regs.SizeBytes()) regs.MarshalUnsafe(buf) @@ -315,7 +315,7 @@ func (c *context64) PtracePokeUser(addr, data uintptr) error { if addr&7 != 0 || addr >= userStructSize { return syscall.EIO } - if addr < uintptr(registersSize) { + if addr < uintptr(ptraceRegistersSize) { regs := c.ptraceGetRegs() buf := make([]byte, regs.SizeBytes()) regs.MarshalUnsafe(buf) diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go index ada7ac7b8..cabbf60e0 100644 --- a/pkg/sentry/arch/arch_arm64.go +++ b/pkg/sentry/arch/arch_arm64.go @@ -142,7 +142,7 @@ func (c *context64) SetStack(value uintptr) { // TLS returns the current TLS pointer. func (c *context64) TLS() uintptr { - return uintptr(c.TPValue) + return uintptr(c.Regs.TPIDR_EL0) } // SetTLS sets the current TLS pointer. Returns false if value is invalid. @@ -151,7 +151,7 @@ func (c *context64) SetTLS(value uintptr) bool { return false } - c.TPValue = uint64(value) + c.Regs.TPIDR_EL0 = uint64(value) return true } diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index dc458b37f..b9405b320 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -31,7 +31,11 @@ import ( ) // Registers represents the CPU registers for this architecture. -type Registers = linux.PtraceRegs +// +// +stateify savable +type Registers struct { + linux.PtraceRegs +} // System-related constants for x86. const ( @@ -311,12 +315,12 @@ func (s *State) ptraceGetRegs() Registers { return regs } -var registersSize = (*Registers)(nil).SizeBytes() +var ptraceRegistersSize = (*linux.PtraceRegs)(nil).SizeBytes() // PtraceSetRegs implements Context.PtraceSetRegs. func (s *State) PtraceSetRegs(src io.Reader) (int, error) { var regs Registers - buf := make([]byte, registersSize) + buf := make([]byte, ptraceRegistersSize) if _, err := io.ReadFull(src, buf); err != nil { return 0, err } @@ -374,7 +378,7 @@ func (s *State) PtraceSetRegs(src io.Reader) (int, error) { } regs.Eflags = (s.Regs.Eflags &^ eflagsPtraceMutable) | (regs.Eflags & eflagsPtraceMutable) s.Regs = regs - return registersSize, nil + return ptraceRegistersSize, nil } // isUserSegmentSelector returns true if the given segment selector specifies a @@ -543,7 +547,7 @@ const ( func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) { switch regset { case _NT_PRSTATUS: - if maxlen < registersSize { + if maxlen < ptraceRegistersSize { return 0, syserror.EFAULT } return s.PtraceGetRegs(dst) @@ -563,7 +567,7 @@ func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) { switch regset { case _NT_PRSTATUS: - if maxlen < registersSize { + if maxlen < ptraceRegistersSize { return 0, syserror.EFAULT } return s.PtraceSetRegs(src) diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go index beba0f771..f5537411e 100644 --- a/pkg/sentry/fs/file_operations.go +++ b/pkg/sentry/fs/file_operations.go @@ -160,6 +160,7 @@ type FileOperations interface { // refer. // // Preconditions: The AddressSpace (if any) that io refers to is activated. + // Must only be called from a task goroutine. Ioctl(ctx context.Context, file *File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) } diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 789369220..5fb419bcd 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -8,7 +8,6 @@ go_template_instance( out = "dirty_set_impl.go", imports = { "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap", - "platform": "gvisor.dev/gvisor/pkg/sentry/platform", }, package = "fsutil", prefix = "Dirty", @@ -25,14 +24,14 @@ go_template_instance( name = "frame_ref_set_impl", out = "frame_ref_set_impl.go", imports = { - "platform": "gvisor.dev/gvisor/pkg/sentry/platform", + "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap", }, package = "fsutil", prefix = "FrameRef", template = "//pkg/segment:generic_set", types = { "Key": "uint64", - "Range": "platform.FileRange", + "Range": "memmap.FileRange", "Value": "uint64", "Functions": "FrameRefSetFunctions", }, @@ -43,7 +42,6 @@ go_template_instance( out = "file_range_set_impl.go", imports = { "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap", - "platform": "gvisor.dev/gvisor/pkg/sentry/platform", }, package = "fsutil", prefix = "FileRange", @@ -86,7 +84,6 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", - "//pkg/sentry/platform", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/state", diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go index c6cd45087..2c9446c1d 100644 --- a/pkg/sentry/fs/fsutil/dirty_set.go +++ b/pkg/sentry/fs/fsutil/dirty_set.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/usermem" ) @@ -159,7 +158,7 @@ func (ds *DirtySet) AllowClean(mr memmap.MappableRange) { // repeatedly until all bytes have been written. max is the true size of the // cached object; offsets beyond max will not be passed to writeAt, even if // they are marked dirty. -func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { +func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, dirty *DirtySet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { var changedDirty bool defer func() { if changedDirty { @@ -194,7 +193,7 @@ func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet // successful partial write, SyncDirtyAll will call it repeatedly until all // bytes have been written. max is the true size of the cached object; offsets // beyond max will not be passed to writeAt, even if they are marked dirty. -func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { +func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { dseg := dirty.FirstSegment() for dseg.Ok() { if err := syncDirtyRange(ctx, dseg.Range(), cache, max, mem, writeAt); err != nil { @@ -210,7 +209,7 @@ func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max } // Preconditions: mr must be page-aligned. -func syncDirtyRange(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { +func syncDirtyRange(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { for cseg := cache.LowerBoundSegment(mr.Start); cseg.Ok() && cseg.Start() < mr.End; cseg = cseg.NextSegment() { wbr := cseg.Range().Intersect(mr) if max < wbr.Start { diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go index 5643cdac9..bbafebf03 100644 --- a/pkg/sentry/fs/fsutil/file_range_set.go +++ b/pkg/sentry/fs/fsutil/file_range_set.go @@ -23,13 +23,12 @@ import ( "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/usermem" ) // FileRangeSet maps offsets into a memmap.Mappable to offsets into a -// platform.File. It is used to implement Mappables that store data in +// memmap.File. It is used to implement Mappables that store data in // sparsely-allocated memory. // // type FileRangeSet <generated by go_generics> @@ -65,20 +64,20 @@ func (FileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, spli } // FileRange returns the FileRange mapped by seg. -func (seg FileRangeIterator) FileRange() platform.FileRange { +func (seg FileRangeIterator) FileRange() memmap.FileRange { return seg.FileRangeOf(seg.Range()) } // FileRangeOf returns the FileRange mapped by mr. // // Preconditions: seg.Range().IsSupersetOf(mr). mr.Length() != 0. -func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) platform.FileRange { +func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) memmap.FileRange { frstart := seg.Value() + (mr.Start - seg.Start()) - return platform.FileRange{frstart, frstart + mr.Length()} + return memmap.FileRange{frstart, frstart + mr.Length()} } // Fill attempts to ensure that all memmap.Mappable offsets in required are -// mapped to a platform.File offset, by allocating from mf with the given +// mapped to a memmap.File offset, by allocating from mf with the given // memory usage kind and invoking readAt to store data into memory. (If readAt // returns a successful partial read, Fill will call it repeatedly until all // bytes have been read.) EOF is handled consistently with the requirements of @@ -141,7 +140,7 @@ func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.Map } // Drop removes segments for memmap.Mappable offsets in mr, freeing the -// corresponding platform.FileRanges. +// corresponding memmap.FileRanges. // // Preconditions: mr must be page-aligned. func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mf *pgalloc.MemoryFile) { @@ -154,7 +153,7 @@ func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mf *pgalloc.MemoryFile) { } // DropAll removes all segments in mr, freeing the corresponding -// platform.FileRanges. +// memmap.FileRanges. func (frs *FileRangeSet) DropAll(mf *pgalloc.MemoryFile) { for seg := frs.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { mf.DecRef(seg.FileRange()) diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go index dd6f5aba6..a808894df 100644 --- a/pkg/sentry/fs/fsutil/frame_ref_set.go +++ b/pkg/sentry/fs/fsutil/frame_ref_set.go @@ -17,7 +17,7 @@ package fsutil import ( "math" - "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usage" ) @@ -39,7 +39,7 @@ func (FrameRefSetFunctions) ClearValue(val *uint64) { } // Merge implements segment.Functions.Merge. -func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) { +func (FrameRefSetFunctions) Merge(_ memmap.FileRange, val1 uint64, _ memmap.FileRange, val2 uint64) (uint64, bool) { if val1 != val2 { return 0, false } @@ -47,13 +47,13 @@ func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform. } // Split implements segment.Functions.Split. -func (FrameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) { +func (FrameRefSetFunctions) Split(_ memmap.FileRange, val uint64, _ uint64) (uint64, uint64) { return val, val } // IncRefAndAccount adds a reference on the range fr. All newly inserted segments // are accounted as host page cache memory mappings. -func (refs *FrameRefSet) IncRefAndAccount(fr platform.FileRange) { +func (refs *FrameRefSet) IncRefAndAccount(fr memmap.FileRange) { seg, gap := refs.Find(fr.Start) for { switch { @@ -74,7 +74,7 @@ func (refs *FrameRefSet) IncRefAndAccount(fr platform.FileRange) { // DecRefAndAccount removes a reference on the range fr and untracks segments // that are removed from memory accounting. -func (refs *FrameRefSet) DecRefAndAccount(fr platform.FileRange) { +func (refs *FrameRefSet) DecRefAndAccount(fr memmap.FileRange) { seg := refs.FindSegment(fr.Start) for seg.Ok() && seg.Start() < fr.End { diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go index e82afd112..ef0113b52 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) @@ -126,7 +125,7 @@ func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) { // offsets in fr or until the next call to UnmapAll. // // Preconditions: The caller must hold a reference on all offsets in fr. -func (f *HostFileMapper) MapInternal(fr platform.FileRange, fd int, write bool) (safemem.BlockSeq, error) { +func (f *HostFileMapper) MapInternal(fr memmap.FileRange, fd int, write bool) (safemem.BlockSeq, error) { chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift) f.mapsMu.Lock() defer f.mapsMu.Unlock() @@ -146,7 +145,7 @@ func (f *HostFileMapper) MapInternal(fr platform.FileRange, fd int, write bool) } // Preconditions: f.mapsMu must be locked. -func (f *HostFileMapper) forEachMappingBlockLocked(fr platform.FileRange, fd int, write bool, fn func(safemem.Block)) error { +func (f *HostFileMapper) forEachMappingBlockLocked(fr memmap.FileRange, fd int, write bool, fn func(safemem.Block)) error { prot := syscall.PROT_READ if write { prot |= syscall.PROT_WRITE diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go index 78fec553e..c15d8a946 100644 --- a/pkg/sentry/fs/fsutil/host_mappable.go +++ b/pkg/sentry/fs/fsutil/host_mappable.go @@ -21,18 +21,17 @@ import ( "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) -// HostMappable implements memmap.Mappable and platform.File over a +// HostMappable implements memmap.Mappable and memmap.File over a // CachedFileObject. // // Lock order (compare the lock order model in mm/mm.go): // truncateMu ("fs locks") // mu ("memmap.Mappable locks not taken by Translate") -// ("platform.File locks") +// ("memmap.File locks") // backingFile ("CachedFileObject locks") // // +stateify savable @@ -124,24 +123,24 @@ func (h *HostMappable) NotifyChangeFD() error { return nil } -// MapInternal implements platform.File.MapInternal. -func (h *HostMappable) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { +// MapInternal implements memmap.File.MapInternal. +func (h *HostMappable) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { return h.hostFileMapper.MapInternal(fr, h.backingFile.FD(), at.Write) } -// FD implements platform.File.FD. +// FD implements memmap.File.FD. func (h *HostMappable) FD() int { return h.backingFile.FD() } -// IncRef implements platform.File.IncRef. -func (h *HostMappable) IncRef(fr platform.FileRange) { +// IncRef implements memmap.File.IncRef. +func (h *HostMappable) IncRef(fr memmap.FileRange) { mr := memmap.MappableRange{Start: fr.Start, End: fr.End} h.hostFileMapper.IncRefOn(mr) } -// DecRef implements platform.File.DecRef. -func (h *HostMappable) DecRef(fr platform.FileRange) { +// DecRef implements memmap.File.DecRef. +func (h *HostMappable) DecRef(fr memmap.FileRange) { mr := memmap.MappableRange{Start: fr.Start, End: fr.End} h.hostFileMapper.DecRefOn(mr) } diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 800c8b4e1..fe8b0b6ac 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -26,7 +26,6 @@ import ( ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" @@ -934,7 +933,7 @@ func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error { - // Whether we have a host fd (and consequently what platform.File is + // Whether we have a host fd (and consequently what memmap.File is // mapped) can change across save/restore, so invalidate all translations // unconditionally. c.mapsMu.Lock() @@ -999,10 +998,10 @@ func (c *CachingInodeOperations) Evict(ctx context.Context, er pgalloc.Evictable } } -// IncRef implements platform.File.IncRef. This is used when we directly map an -// underlying host fd and CachingInodeOperations is used as the platform.File +// IncRef implements memmap.File.IncRef. This is used when we directly map an +// underlying host fd and CachingInodeOperations is used as the memmap.File // during translation. -func (c *CachingInodeOperations) IncRef(fr platform.FileRange) { +func (c *CachingInodeOperations) IncRef(fr memmap.FileRange) { // Hot path. Avoid defers. c.dataMu.Lock() seg, gap := c.refs.Find(fr.Start) @@ -1024,10 +1023,10 @@ func (c *CachingInodeOperations) IncRef(fr platform.FileRange) { } } -// DecRef implements platform.File.DecRef. This is used when we directly map an -// underlying host fd and CachingInodeOperations is used as the platform.File +// DecRef implements memmap.File.DecRef. This is used when we directly map an +// underlying host fd and CachingInodeOperations is used as the memmap.File // during translation. -func (c *CachingInodeOperations) DecRef(fr platform.FileRange) { +func (c *CachingInodeOperations) DecRef(fr memmap.FileRange) { // Hot path. Avoid defers. c.dataMu.Lock() seg := c.refs.FindSegment(fr.Start) @@ -1046,15 +1045,15 @@ func (c *CachingInodeOperations) DecRef(fr platform.FileRange) { c.dataMu.Unlock() } -// MapInternal implements platform.File.MapInternal. This is used when we +// MapInternal implements memmap.File.MapInternal. This is used when we // directly map an underlying host fd and CachingInodeOperations is used as the -// platform.File during translation. -func (c *CachingInodeOperations) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { +// memmap.File during translation. +func (c *CachingInodeOperations) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { return c.hostFileMapper.MapInternal(fr, c.backingFile.FD(), at.Write) } -// FD implements platform.File.FD. This is used when we directly map an -// underlying host fd and CachingInodeOperations is used as the platform.File +// FD implements memmap.File.FD. This is used when we directly map an +// underlying host fd and CachingInodeOperations is used as the memmap.File // during translation. func (c *CachingInodeOperations) FD() int { return c.backingFile.FD() diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go index 2018b978a..a91cae3ef 100644 --- a/pkg/sentry/fsimpl/devpts/slave.go +++ b/pkg/sentry/fsimpl/devpts/slave.go @@ -132,7 +132,7 @@ func (sfd *slaveFileDescription) Write(ctx context.Context, src usermem.IOSequen return sfd.inode.t.ld.outputQueueWrite(ctx, src) } -// Ioctl implements vfs.FileDescripionImpl.Ioctl. +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. func (sfd *slaveFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { switch cmd := args[1].Uint(); cmd { case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD index 3e00c2abb..67649e811 100644 --- a/pkg/sentry/fsimpl/fuse/BUILD +++ b/pkg/sentry/fsimpl/fuse/BUILD @@ -1,20 +1,63 @@ -load("//tools:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) +go_template_instance( + name = "request_list", + out = "request_list.go", + package = "fuse", + prefix = "request", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*Request", + "Linker": "*Request", + }, +) + go_library( name = "fuse", srcs = [ + "connection.go", "dev.go", + "fusefs.go", + "register.go", + "request_list.go", ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/log", "//pkg/sentry/fsimpl/devtmpfs", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + "//tools/go_marshal/marshal", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_test( + name = "dev_test", + size = "small", + srcs = ["dev_test.go"], + library = ":fuse", + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/fsimpl/testutil", + "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", "//pkg/syserror", "//pkg/usermem", + "//pkg/waiter", + "//tools/go_marshal/marshal", ], ) diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go new file mode 100644 index 000000000..f330da0bd --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/connection.go @@ -0,0 +1,255 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "errors" + "fmt" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/waiter" + "gvisor.dev/gvisor/tools/go_marshal/marshal" +) + +// MaxActiveRequestsDefault is the default setting controlling the upper bound +// on the number of active requests at any given time. +const MaxActiveRequestsDefault = 10000 + +var ( + // Ordinary requests have even IDs, while interrupts IDs are odd. + InitReqBit uint64 = 1 + ReqIDStep uint64 = 2 +) + +// Request represents a FUSE operation request that hasn't been sent to the +// server yet. +// +// +stateify savable +type Request struct { + requestEntry + + id linux.FUSEOpID + hdr *linux.FUSEHeaderIn + data []byte +} + +// Response represents an actual response from the server, including the +// response payload. +// +// +stateify savable +type Response struct { + opcode linux.FUSEOpcode + hdr linux.FUSEHeaderOut + data []byte +} + +// Connection is the struct by which the sentry communicates with the FUSE server daemon. +type Connection struct { + fd *DeviceFD + + // MaxWrite is the daemon's maximum size of a write buffer. + // This is negotiated during FUSE_INIT. + MaxWrite uint32 +} + +// NewFUSEConnection creates a FUSE connection to fd +func NewFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRequests uint64) (*Connection, error) { + // Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to + // mount a FUSE filesystem. + fuseFD := fd.Impl().(*DeviceFD) + fuseFD.mounted = true + + // Create the writeBuf for the header to be stored in. + hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) + fuseFD.writeBuf = make([]byte, hdrLen) + fuseFD.completions = make(map[linux.FUSEOpID]*futureResponse) + fuseFD.fullQueueCh = make(chan struct{}, maxInFlightRequests) + fuseFD.writeCursor = 0 + + return &Connection{ + fd: fuseFD, + }, nil +} + +// NewRequest creates a new request that can be sent to the FUSE server. +func (conn *Connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) { + conn.fd.mu.Lock() + defer conn.fd.mu.Unlock() + conn.fd.nextOpID += linux.FUSEOpID(ReqIDStep) + + hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes() + hdr := linux.FUSEHeaderIn{ + Len: uint32(hdrLen + payload.SizeBytes()), + Opcode: opcode, + Unique: conn.fd.nextOpID, + NodeID: ino, + UID: uint32(creds.EffectiveKUID), + GID: uint32(creds.EffectiveKGID), + PID: pid, + } + + buf := make([]byte, hdr.Len) + hdr.MarshalUnsafe(buf[:hdrLen]) + payload.MarshalUnsafe(buf[hdrLen:]) + + return &Request{ + id: hdr.Unique, + hdr: &hdr, + data: buf, + }, nil +} + +// Call makes a request to the server and blocks the invoking task until a +// server responds with a response. +// NOTE: If no task is provided then the Call will simply enqueue the request +// and return a nil response. No blocking will happen in this case. Instead, +// this is used to signify that the processing of this request will happen by +// the kernel.Task that writes the response. See FUSE_INIT for such an +// invocation. +func (conn *Connection) Call(t *kernel.Task, r *Request) (*Response, error) { + fut, err := conn.callFuture(t, r) + if err != nil { + return nil, err + } + + return fut.resolve(t) +} + +// Error returns the error of the FUSE call. +func (r *Response) Error() error { + errno := r.hdr.Error + if errno >= 0 { + return nil + } + + sysErrNo := syscall.Errno(-errno) + return error(sysErrNo) +} + +// UnmarshalPayload unmarshals the response data into m. +func (r *Response) UnmarshalPayload(m marshal.Marshallable) error { + hdrLen := r.hdr.SizeBytes() + haveDataLen := r.hdr.Len - uint32(hdrLen) + wantDataLen := uint32(m.SizeBytes()) + + if haveDataLen < wantDataLen { + return fmt.Errorf("payload too small. Minimum data lenth required: %d, but got data length %d", wantDataLen, haveDataLen) + } + + m.UnmarshalUnsafe(r.data[hdrLen:]) + return nil +} + +// callFuture makes a request to the server and returns a future response. +// Call resolve() when the response needs to be fulfilled. +func (conn *Connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) { + conn.fd.mu.Lock() + defer conn.fd.mu.Unlock() + + // Is the queue full? + // + // We must busy wait here until the request can be queued. We don't + // block on the fd.fullQueueCh with a lock - so after being signalled, + // before we acquire the lock, it is possible that a barging task enters + // and queues a request. As a result, upon acquiring the lock we must + // again check if the room is available. + // + // This can potentially starve a request forever but this can only happen + // if there are always too many ongoing requests all the time. The + // supported maxActiveRequests setting should be really high to avoid this. + for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests { + if t == nil { + // Since there is no task that is waiting. We must error out. + return nil, errors.New("FUSE request queue full") + } + + log.Infof("Blocking request %v from being queued. Too many active requests: %v", + r.id, conn.fd.numActiveRequests) + conn.fd.mu.Unlock() + err := t.Block(conn.fd.fullQueueCh) + conn.fd.mu.Lock() + if err != nil { + return nil, err + } + } + + return conn.callFutureLocked(t, r) +} + +// callFutureLocked makes a request to the server and returns a future response. +func (conn *Connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) { + conn.fd.queue.PushBack(r) + conn.fd.numActiveRequests += 1 + fut := newFutureResponse(r.hdr.Opcode) + conn.fd.completions[r.id] = fut + + // Signal the readers that there is something to read. + conn.fd.waitQueue.Notify(waiter.EventIn) + + return fut, nil +} + +// futureResponse represents an in-flight request, that may or may not have +// completed yet. Convert it to a resolved Response by calling Resolve, but note +// that this may block. +// +// +stateify savable +type futureResponse struct { + opcode linux.FUSEOpcode + ch chan struct{} + hdr *linux.FUSEHeaderOut + data []byte +} + +// newFutureResponse creates a future response to a FUSE request. +func newFutureResponse(opcode linux.FUSEOpcode) *futureResponse { + return &futureResponse{ + opcode: opcode, + ch: make(chan struct{}), + } +} + +// resolve blocks the task until the server responds to its corresponding request, +// then returns a resolved response. +func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) { + // If there is no Task associated with this request - then we don't try to resolve + // the response. Instead, the task writing the response (proxy to the server) will + // process the response on our behalf. + if t == nil { + log.Infof("fuse.Response.resolve: Not waiting on a response from server.") + return nil, nil + } + + if err := t.Block(f.ch); err != nil { + return nil, err + } + + return f.getResponse(), nil +} + +// getResponse creates a Response from the data the futureResponse has. +func (f *futureResponse) getResponse() *Response { + return &Response{ + opcode: f.opcode, + hdr: *f.hdr, + data: f.data, + } +} diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go index dc33268af..f3443ac71 100644 --- a/pkg/sentry/fsimpl/fuse/dev.go +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -15,13 +15,17 @@ package fuse import ( + "syscall" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" ) const fuseDevMinor = 229 @@ -51,9 +55,46 @@ type DeviceFD struct { vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD - // TODO(gvisor.dev/issue/2987): Add all the data structures needed to enqueue - // and deque requests, control synchronization and establish communication - // between the FUSE kernel module and the /dev/fuse character device. + // mounted specifies whether a FUSE filesystem was mounted using the DeviceFD. + mounted bool + + // nextOpID is used to create new requests. + nextOpID linux.FUSEOpID + + // queue is the list of requests that need to be processed by the FUSE server. + queue requestList + + // numActiveRequests is the number of requests made by the Sentry that has + // yet to be responded to. + numActiveRequests uint64 + + // completions is used to map a request to its response. A Writer will use this + // to notify the caller of a completed response. + completions map[linux.FUSEOpID]*futureResponse + + writeCursor uint32 + + // writeBuf is the memory buffer used to copy in the FUSE out header from + // userspace. + writeBuf []byte + + // writeCursorFR current FR being copied from server. + writeCursorFR *futureResponse + + // mu protects all the queues, maps, buffers and cursors and nextOpID. + mu sync.Mutex + + // waitQueue is used to notify interested parties when the device becomes + // readable or writable. + waitQueue waiter.Queue + + // fullQueueCh is a channel used to synchronize the readers with the writers. + // Writers (inbound requests to the filesystem) block if there are too many + // unprocessed in-flight requests. + fullQueueCh chan struct{} + + // fs is the FUSE filesystem that this FD is being used for. + fs *filesystem } // Release implements vfs.FileDescriptionImpl.Release. @@ -61,45 +102,293 @@ func (fd *DeviceFD) Release() {} // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. + if !fd.mounted { + return 0, syserror.EPERM + } + return 0, syserror.ENOSYS } // Read implements vfs.FileDescriptionImpl.Read. func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - return 0, syserror.ENOSYS + // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. + if !fd.mounted { + return 0, syserror.EPERM + } + + // We require that any Read done on this filesystem have a sane minimum + // read buffer. It must have the capacity for the fixed parts of any request + // header (Linux uses the request header and the FUSEWriteIn header for this + // calculation) + the negotiated MaxWrite room for the data. + minBuffSize := linux.FUSE_MIN_READ_BUFFER + inHdrLen := uint32((*linux.FUSEHeaderIn)(nil).SizeBytes()) + writeHdrLen := uint32((*linux.FUSEWriteIn)(nil).SizeBytes()) + negotiatedMinBuffSize := inHdrLen + writeHdrLen + fd.fs.conn.MaxWrite + if minBuffSize < negotiatedMinBuffSize { + minBuffSize = negotiatedMinBuffSize + } + + // If the read buffer is too small, error out. + if dst.NumBytes() < int64(minBuffSize) { + return 0, syserror.EINVAL + } + + fd.mu.Lock() + defer fd.mu.Unlock() + return fd.readLocked(ctx, dst, opts) +} + +// readLocked implements the reading of the fuse device while locked with DeviceFD.mu. +func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + if fd.queue.Empty() { + return 0, syserror.ErrWouldBlock + } + + var readCursor uint32 + var bytesRead int64 + for { + req := fd.queue.Front() + if dst.NumBytes() < int64(req.hdr.Len) { + // The request is too large. Cannot process it. All requests must be smaller than the + // negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT + // handshake. + errno := -int32(syscall.EIO) + if req.hdr.Opcode == linux.FUSE_SETXATTR { + errno = -int32(syscall.E2BIG) + } + + // Return the error to the calling task. + if err := fd.sendError(ctx, errno, req); err != nil { + return 0, err + } + + // We're done with this request. + fd.queue.Remove(req) + + // Restart the read as this request was invalid. + log.Warningf("fuse.DeviceFD.Read: request found was too large. Restarting read.") + return fd.readLocked(ctx, dst, opts) + } + + n, err := dst.CopyOut(ctx, req.data[readCursor:]) + if err != nil { + return 0, err + } + readCursor += uint32(n) + bytesRead += int64(n) + + if readCursor >= req.hdr.Len { + // Fully done with this req, remove it from the queue. + fd.queue.Remove(req) + break + } + } + + return bytesRead, nil } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. + if !fd.mounted { + return 0, syserror.EPERM + } + return 0, syserror.ENOSYS } // Write implements vfs.FileDescriptionImpl.Write. func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.ENOSYS + fd.mu.Lock() + defer fd.mu.Unlock() + return fd.writeLocked(ctx, src, opts) +} + +// writeLocked implements writing to the fuse device while locked with DeviceFD.mu. +func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. + if !fd.mounted { + return 0, syserror.EPERM + } + + var cn, n int64 + hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) + + for src.NumBytes() > 0 { + if fd.writeCursorFR != nil { + // Already have common header, and we're now copying the payload. + wantBytes := fd.writeCursorFR.hdr.Len + + // Note that the FR data doesn't have the header. Copy it over if its necessary. + if fd.writeCursorFR.data == nil { + fd.writeCursorFR.data = make([]byte, wantBytes) + } + + bytesCopied, err := src.CopyIn(ctx, fd.writeCursorFR.data[fd.writeCursor:wantBytes]) + if err != nil { + return 0, err + } + src = src.DropFirst(bytesCopied) + + cn = int64(bytesCopied) + n += cn + fd.writeCursor += uint32(cn) + if fd.writeCursor == wantBytes { + // Done reading this full response. Clean up and unblock the + // initiator. + break + } + + // Check if we have more data in src. + continue + } + + // Assert that the header isn't read into the writeBuf yet. + if fd.writeCursor >= hdrLen { + return 0, syserror.EINVAL + } + + // We don't have the full common response header yet. + wantBytes := hdrLen - fd.writeCursor + bytesCopied, err := src.CopyIn(ctx, fd.writeBuf[fd.writeCursor:wantBytes]) + if err != nil { + return 0, err + } + src = src.DropFirst(bytesCopied) + + cn = int64(bytesCopied) + n += cn + fd.writeCursor += uint32(cn) + if fd.writeCursor == hdrLen { + // Have full header in the writeBuf. Use it to fetch the actual futureResponse + // from the device's completions map. + var hdr linux.FUSEHeaderOut + hdr.UnmarshalBytes(fd.writeBuf) + + // We have the header now and so the writeBuf has served its purpose. + // We could reset it manually here but instead of doing that, at the + // end of the write, the writeCursor will be set to 0 thereby allowing + // the next request to overwrite whats in the buffer, + + fut, ok := fd.completions[hdr.Unique] + if !ok { + // Server sent us a response for a request we never sent? + return 0, syserror.EINVAL + } + + delete(fd.completions, hdr.Unique) + + // Copy over the header into the future response. The rest of the payload + // will be copied over to the FR's data in the next iteration. + fut.hdr = &hdr + fd.writeCursorFR = fut + + // Next iteration will now try read the complete request, if src has + // any data remaining. Otherwise we're done. + } + } + + if fd.writeCursorFR != nil { + if err := fd.sendResponse(ctx, fd.writeCursorFR); err != nil { + return 0, err + } + + // Ready the device for the next request. + fd.writeCursorFR = nil + fd.writeCursor = 0 + } + + return n, nil +} + +// Readiness implements vfs.FileDescriptionImpl.Readiness. +func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask { + var ready waiter.EventMask + ready |= waiter.EventOut // FD is always writable + if !fd.queue.Empty() { + // Have reqs available, FD is readable. + ready |= waiter.EventIn + } + + return ready & mask +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (fd *DeviceFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + fd.waitQueue.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (fd *DeviceFD) EventUnregister(e *waiter.Entry) { + fd.waitQueue.EventUnregister(e) } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. + if !fd.mounted { + return 0, syserror.EPERM + } + return 0, syserror.ENOSYS } -// Register registers the FUSE device with vfsObj. -func Register(vfsObj *vfs.VirtualFilesystem) error { - if err := vfsObj.RegisterDevice(vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, fuseDevice{}, &vfs.RegisterDeviceOptions{ - GroupName: "misc", - }); err != nil { +// sendResponse sends a response to the waiting task (if any). +func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error { + // See if the running task need to perform some action before returning. + // Since we just finished writing the future, we can be sure that + // getResponse generates a populated response. + if err := fd.noReceiverAction(ctx, fut.getResponse()); err != nil { return err } + // Signal that the queue is no longer full. + select { + case fd.fullQueueCh <- struct{}{}: + default: + } + fd.numActiveRequests -= 1 + + // Signal the task waiting on a response. + close(fut.ch) return nil } -// CreateDevtmpfsFile creates a device special file in devtmpfs. -func CreateDevtmpfsFile(ctx context.Context, dev *devtmpfs.Accessor) error { - if err := dev.CreateDeviceFile(ctx, "fuse", vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, 0666 /* mode */); err != nil { +// sendError sends an error response to the waiting task (if any). +func (fd *DeviceFD) sendError(ctx context.Context, errno int32, req *Request) error { + // Return the error to the calling task. + outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) + respHdr := linux.FUSEHeaderOut{ + Len: outHdrLen, + Error: errno, + Unique: req.hdr.Unique, + } + + fut, ok := fd.completions[respHdr.Unique] + if !ok { + // Server sent us a response for a request we never sent? + return syserror.EINVAL + } + delete(fd.completions, respHdr.Unique) + + fut.hdr = &respHdr + if err := fd.sendResponse(ctx, fut); err != nil { return err } return nil } + +// noReceiverAction has the calling kernel.Task do some action if its known that no +// receiver is going to be waiting on the future channel. This is to be used by: +// FUSE_INIT. +func (fd *DeviceFD) noReceiverAction(ctx context.Context, r *Response) error { + if r.opcode == linux.FUSE_INIT { + // TODO: process init response here. + // Maybe get the creds from the context? + // creds := auth.CredentialsFromContext(ctx) + } + + return nil +} diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go new file mode 100644 index 000000000..fcd77832a --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/dev_test.go @@ -0,0 +1,429 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "fmt" + "io" + "math/rand" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" + "gvisor.dev/gvisor/tools/go_marshal/marshal" +) + +// echoTestOpcode is the Opcode used during testing. The server used in tests +// will simply echo the payload back with the appropriate headers. +const echoTestOpcode linux.FUSEOpcode = 1000 + +type testPayload struct { + data uint32 +} + +// TestFUSECommunication tests that the communication layer between the Sentry and the +// FUSE server daemon works as expected. +func TestFUSECommunication(t *testing.T) { + s := setup(t) + defer s.Destroy() + + k := kernel.KernelFromContext(s.Ctx) + creds := auth.CredentialsFromContext(s.Ctx) + + // Create test cases with different number of concurrent clients and servers. + testCases := []struct { + Name string + NumClients int + NumServers int + MaxActiveRequests uint64 + }{ + { + Name: "SingleClientSingleServer", + NumClients: 1, + NumServers: 1, + MaxActiveRequests: MaxActiveRequestsDefault, + }, + { + Name: "SingleClientMultipleServers", + NumClients: 1, + NumServers: 10, + MaxActiveRequests: MaxActiveRequestsDefault, + }, + { + Name: "MultipleClientsSingleServer", + NumClients: 10, + NumServers: 1, + MaxActiveRequests: MaxActiveRequestsDefault, + }, + { + Name: "MultipleClientsMultipleServers", + NumClients: 10, + NumServers: 10, + MaxActiveRequests: MaxActiveRequestsDefault, + }, + { + Name: "RequestCapacityFull", + NumClients: 10, + NumServers: 1, + MaxActiveRequests: 1, + }, + { + Name: "RequestCapacityContinuouslyFull", + NumClients: 100, + NumServers: 2, + MaxActiveRequests: 2, + }, + } + + for _, testCase := range testCases { + t.Run(testCase.Name, func(t *testing.T) { + conn, fd, err := newTestConnection(s, k, testCase.MaxActiveRequests) + if err != nil { + t.Fatalf("newTestConnection: %v", err) + } + + clientsDone := make([]chan struct{}, testCase.NumClients) + serversDone := make([]chan struct{}, testCase.NumServers) + serversKill := make([]chan struct{}, testCase.NumServers) + + // FUSE clients. + for i := 0; i < testCase.NumClients; i++ { + clientsDone[i] = make(chan struct{}) + go func(i int) { + fuseClientRun(t, s, k, conn, creds, uint32(i), uint64(i), clientsDone[i]) + }(i) + } + + // FUSE servers. + for j := 0; j < testCase.NumServers; j++ { + serversDone[j] = make(chan struct{}) + serversKill[j] = make(chan struct{}, 1) // The kill command shouldn't block. + go func(j int) { + fuseServerRun(t, s, k, fd, serversDone[j], serversKill[j]) + }(j) + } + + // Tear down. + // + // Make sure all the clients are done. + for i := 0; i < testCase.NumClients; i++ { + <-clientsDone[i] + } + + // Kill any server that is potentially waiting. + for j := 0; j < testCase.NumServers; j++ { + serversKill[j] <- struct{}{} + } + + // Make sure all the servers are done. + for j := 0; j < testCase.NumServers; j++ { + <-serversDone[j] + } + }) + } +} + +// CallTest makes a request to the server and blocks the invoking +// goroutine until a server responds with a response. Doesn't block +// a kernel.Task. Analogous to Connection.Call but used for testing. +func CallTest(conn *Connection, t *kernel.Task, r *Request, i uint32) (*Response, error) { + conn.fd.mu.Lock() + + // Wait until we're certain that a new request can be processed. + for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests { + conn.fd.mu.Unlock() + select { + case <-conn.fd.fullQueueCh: + } + conn.fd.mu.Lock() + } + + fut, err := conn.callFutureLocked(t, r) // No task given. + conn.fd.mu.Unlock() + + if err != nil { + return nil, err + } + + // Resolve the response. + // + // Block without a task. + select { + case <-fut.ch: + } + + // A response is ready. Resolve and return it. + return fut.getResponse(), nil +} + +// ReadTest is analogous to vfs.FileDescription.Read and reads from the FUSE +// device. However, it does so by - not blocking the task that is calling - and +// instead just waits on a channel. The behaviour is essentially the same as +// DeviceFD.Read except it guarantees that the task is not blocked. +func ReadTest(serverTask *kernel.Task, fd *vfs.FileDescription, inIOseq usermem.IOSequence, killServer chan struct{}) (int64, bool, error) { + var err error + var n, total int64 + + dev := fd.Impl().(*DeviceFD) + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + dev.EventRegister(&w, waiter.EventIn) + for { + // Issue the request and break out if it completes with anything other than + // "would block". + n, err = dev.Read(serverTask, inIOseq, vfs.ReadOptions{}) + total += n + if err != syserror.ErrWouldBlock { + break + } + + // Wait for a notification that we should retry. + // Emulate the blocking for when no requests are available + select { + case <-ch: + case <-killServer: + // Server killed by the main program. + return 0, true, nil + } + } + + dev.EventUnregister(&w) + return total, false, err +} + +// fuseClientRun emulates all the actions of a normal FUSE request. It creates +// a header, a payload, calls the server, waits for the response, and processes +// the response. +func fuseClientRun(t *testing.T, s *testutil.System, k *kernel.Kernel, conn *Connection, creds *auth.Credentials, pid uint32, inode uint64, clientDone chan struct{}) { + defer func() { clientDone <- struct{}{} }() + + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + clientTask, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("fuse-client-%v", pid), tc, s.MntNs, s.Root, s.Root) + if err != nil { + t.Fatal(err) + } + testObj := &testPayload{ + data: rand.Uint32(), + } + + req, err := conn.NewRequest(creds, pid, inode, echoTestOpcode, testObj) + if err != nil { + t.Fatalf("NewRequest creation failed: %v", err) + } + + // Queue up a request. + // Analogous to Call except it doesn't block on the task. + resp, err := CallTest(conn, clientTask, req, pid) + if err != nil { + t.Fatalf("CallTaskNonBlock failed: %v", err) + } + + if err = resp.Error(); err != nil { + t.Fatalf("Server responded with an error: %v", err) + } + + var respTestPayload testPayload + if err := resp.UnmarshalPayload(&respTestPayload); err != nil { + t.Fatalf("Unmarshalling payload error: %v", err) + } + + if resp.hdr.Unique != req.hdr.Unique { + t.Fatalf("got response for another request. Expected response for req %v but got response for req %v", + req.hdr.Unique, resp.hdr.Unique) + } + + if respTestPayload.data != testObj.data { + t.Fatalf("read incorrect data. Data expected: %v, but got %v", testObj.data, respTestPayload.data) + } + +} + +// fuseServerRun creates a task and emulates all the actions of a simple FUSE server +// that simply reads a request and echos the same struct back as a response using the +// appropriate headers. +func fuseServerRun(t *testing.T, s *testutil.System, k *kernel.Kernel, fd *vfs.FileDescription, serverDone, killServer chan struct{}) { + defer func() { serverDone <- struct{}{} }() + + // Create the tasks that the server will be using. + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + var readPayload testPayload + + serverTask, err := testutil.CreateTask(s.Ctx, "fuse-server", tc, s.MntNs, s.Root, s.Root) + if err != nil { + t.Fatal(err) + } + + // Read the request. + for { + inHdrLen := uint32((*linux.FUSEHeaderIn)(nil).SizeBytes()) + payloadLen := uint32(readPayload.SizeBytes()) + + // The raed buffer must meet some certain size criteria. + buffSize := inHdrLen + payloadLen + if buffSize < linux.FUSE_MIN_READ_BUFFER { + buffSize = linux.FUSE_MIN_READ_BUFFER + } + inBuf := make([]byte, buffSize) + inIOseq := usermem.BytesIOSequence(inBuf) + + n, serverKilled, err := ReadTest(serverTask, fd, inIOseq, killServer) + if err != nil { + t.Fatalf("Read failed :%v", err) + } + + // Server should shut down. No new requests are going to be made. + if serverKilled { + break + } + + if n <= 0 { + t.Fatalf("Read read no bytes") + } + + var readFUSEHeaderIn linux.FUSEHeaderIn + readFUSEHeaderIn.UnmarshalUnsafe(inBuf[:inHdrLen]) + readPayload.UnmarshalUnsafe(inBuf[inHdrLen : inHdrLen+payloadLen]) + + if readFUSEHeaderIn.Opcode != echoTestOpcode { + t.Fatalf("read incorrect data. Header: %v, Payload: %v", readFUSEHeaderIn, readPayload) + } + + // Write the response. + outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) + outBuf := make([]byte, outHdrLen+payloadLen) + outHeader := linux.FUSEHeaderOut{ + Len: outHdrLen + payloadLen, + Error: 0, + Unique: readFUSEHeaderIn.Unique, + } + + // Echo the payload back. + outHeader.MarshalUnsafe(outBuf[:outHdrLen]) + readPayload.MarshalUnsafe(outBuf[outHdrLen:]) + outIOseq := usermem.BytesIOSequence(outBuf) + + n, err = fd.Write(s.Ctx, outIOseq, vfs.WriteOptions{}) + if err != nil { + t.Fatalf("Write failed :%v", err) + } + } +} + +func setup(t *testing.T) *testutil.System { + k, err := testutil.Boot() + if err != nil { + t.Fatalf("Error creating kernel: %v", err) + } + + ctx := k.SupervisorContext() + creds := auth.CredentialsFromContext(ctx) + + k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserList: true, + AllowUserMount: true, + }) + + mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.GetFilesystemOptions{}) + if err != nil { + t.Fatalf("NewMountNamespace(): %v", err) + } + + return testutil.NewSystem(ctx, t, k.VFS(), mntns) +} + +// newTestConnection creates a fuse connection that the sentry can communicate with +// and the FD for the server to communicate with. +func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*Connection, *vfs.FileDescription, error) { + vfsObj := &vfs.VirtualFilesystem{} + fuseDev := &DeviceFD{} + + if err := vfsObj.Init(); err != nil { + return nil, nil, err + } + + vd := vfsObj.NewAnonVirtualDentry("genCountFD") + defer vd.DecRef() + if err := fuseDev.vfsfd.Init(fuseDev, linux.O_RDWR|linux.O_CREAT, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{}); err != nil { + return nil, nil, err + } + + fsopts := filesystemOptions{ + maxActiveRequests: maxActiveRequests, + } + fs, err := NewFUSEFilesystem(system.Ctx, 0, &fsopts, &fuseDev.vfsfd) + if err != nil { + return nil, nil, err + } + + return fs.conn, &fuseDev.vfsfd, nil +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (t *testPayload) SizeBytes() int { + return 4 +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (t *testPayload) MarshalBytes(dst []byte) { + usermem.ByteOrder.PutUint32(dst[:4], t.data) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (t *testPayload) UnmarshalBytes(src []byte) { + *t = testPayload{data: usermem.ByteOrder.Uint32(src[:4])} +} + +// Packed implements marshal.Marshallable.Packed. +func (t *testPayload) Packed() bool { + return true +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (t *testPayload) MarshalUnsafe(dst []byte) { + t.MarshalBytes(dst) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (t *testPayload) UnmarshalUnsafe(src []byte) { + t.UnmarshalBytes(src) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +func (t *testPayload) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + panic("not implemented") +} + +// CopyOut implements marshal.Marshallable.CopyOut. +func (t *testPayload) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + panic("not implemented") +} + +// CopyIn implements marshal.Marshallable.CopyIn. +func (t *testPayload) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + panic("not implemented") +} + +// WriteTo implements io.WriterTo.WriteTo. +func (t *testPayload) WriteTo(w io.Writer) (int64, error) { + panic("not implemented") +} diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go new file mode 100644 index 000000000..911b6f7cb --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -0,0 +1,224 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fuse implements fusefs. +package fuse + +import ( + "strconv" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Name is the default filesystem name. +const Name = "fuse" + +// FilesystemType implements vfs.FilesystemType. +type FilesystemType struct{} + +type filesystemOptions struct { + // userID specifies the numeric uid of the mount owner. + // This option should not be specified by the filesystem owner. + // It is set by libfuse (or, if libfuse is not used, must be set + // by the filesystem itself). For more information, see man page + // for fuse(8) + userID uint32 + + // groupID specifies the numeric gid of the mount owner. + // This option should not be specified by the filesystem owner. + // It is set by libfuse (or, if libfuse is not used, must be set + // by the filesystem itself). For more information, see man page + // for fuse(8) + groupID uint32 + + // rootMode specifies the the file mode of the filesystem's root. + rootMode linux.FileMode + + // maxActiveRequests specifies the maximum number of active requests that can + // exist at any time. Any further requests will block when trying to + // Call the server. + maxActiveRequests uint64 +} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + kernfs.Filesystem + devMinor uint32 + + // conn is used for communication between the FUSE server + // daemon and the sentry fusefs. + conn *Connection + + // opts is the options the fusefs is initialized with. + opts *filesystemOptions +} + +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + + var fsopts filesystemOptions + mopts := vfs.GenericParseMountOptions(opts.Data) + deviceDescriptorStr, ok := mopts["fd"] + if !ok { + log.Warningf("%s.GetFilesystem: communication file descriptor N (obtained by opening /dev/fuse) must be specified as 'fd=N'", fsType.Name()) + return nil, nil, syserror.EINVAL + } + delete(mopts, "fd") + + deviceDescriptor, err := strconv.ParseInt(deviceDescriptorStr, 10 /* base */, 32 /* bitSize */) + if err != nil { + return nil, nil, err + } + + kernelTask := kernel.TaskFromContext(ctx) + if kernelTask == nil { + log.Warningf("%s.GetFilesystem: couldn't get kernel task from context", fsType.Name()) + return nil, nil, syserror.EINVAL + } + fuseFd := kernelTask.GetFileVFS2(int32(deviceDescriptor)) + + // Parse and set all the other supported FUSE mount options. + // TODO(gVisor.dev/issue/3229): Expand the supported mount options. + if userIDStr, ok := mopts["user_id"]; ok { + delete(mopts, "user_id") + userID, err := strconv.ParseUint(userIDStr, 10, 32) + if err != nil { + log.Warningf("%s.GetFilesystem: invalid user_id: user_id=%s", fsType.Name(), userIDStr) + return nil, nil, syserror.EINVAL + } + fsopts.userID = uint32(userID) + } + + if groupIDStr, ok := mopts["group_id"]; ok { + delete(mopts, "group_id") + groupID, err := strconv.ParseUint(groupIDStr, 10, 32) + if err != nil { + log.Warningf("%s.GetFilesystem: invalid group_id: group_id=%s", fsType.Name(), groupIDStr) + return nil, nil, syserror.EINVAL + } + fsopts.groupID = uint32(groupID) + } + + rootMode := linux.FileMode(0777) + modeStr, ok := mopts["rootmode"] + if ok { + delete(mopts, "rootmode") + mode, err := strconv.ParseUint(modeStr, 8, 32) + if err != nil { + log.Warningf("%s.GetFilesystem: invalid mode: %q", fsType.Name(), modeStr) + return nil, nil, syserror.EINVAL + } + rootMode = linux.FileMode(mode) + } + fsopts.rootMode = rootMode + + // Set the maxInFlightRequests option. + fsopts.maxActiveRequests = MaxActiveRequestsDefault + + // Check for unparsed options. + if len(mopts) != 0 { + log.Warningf("%s.GetFilesystem: unknown options: %v", fsType.Name(), mopts) + return nil, nil, syserror.EINVAL + } + + // Create a new FUSE filesystem. + fs, err := NewFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd) + if err != nil { + log.Warningf("%s.NewFUSEFilesystem: failed with error: %v", fsType.Name(), err) + return nil, nil, err + } + + fs.VFSFilesystem().Init(vfsObj, &fsType, fs) + + // TODO: dispatch a FUSE_INIT request to the FUSE daemon server before + // returning. Mount will not block on this dispatched request. + + // root is the fusefs root directory. + root := fs.newInode(creds, fsopts.rootMode) + + return fs.VFSFilesystem(), root.VFSDentry(), nil +} + +// NewFUSEFilesystem creates a new FUSE filesystem. +func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) { + fs := &filesystem{ + devMinor: devMinor, + opts: opts, + } + + conn, err := NewFUSEConnection(ctx, device, opts.maxActiveRequests) + if err != nil { + log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err) + return nil, syserror.EINVAL + } + + fs.conn = conn + fuseFD := device.Impl().(*DeviceFD) + fuseFD.fs = fs + + return fs, nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() +} + +// Inode implements kernfs.Inode. +type Inode struct { + kernfs.InodeAttrs + kernfs.InodeNoDynamicLookup + kernfs.InodeNotSymlink + kernfs.InodeDirectoryNoNewChildren + kernfs.OrderedChildren + + locks vfs.FileLocks + + dentry kernfs.Dentry +} + +func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry { + i := &Inode{} + i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) + i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + i.dentry.Init(i) + + return &i.dentry +} + +// Open implements kernfs.Inode.Open. +func (i *Inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) + if err != nil { + return nil, err + } + return fd.VFSFileDescription(), nil +} diff --git a/pkg/sentry/fsimpl/fuse/register.go b/pkg/sentry/fsimpl/fuse/register.go new file mode 100644 index 000000000..b5b581152 --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/register.go @@ -0,0 +1,42 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// Register registers the FUSE device with vfsObj. +func Register(vfsObj *vfs.VirtualFilesystem) error { + if err := vfsObj.RegisterDevice(vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, fuseDevice{}, &vfs.RegisterDeviceOptions{ + GroupName: "misc", + }); err != nil { + return err + } + + return nil +} + +// CreateDevtmpfsFile creates a device special file in devtmpfs. +func CreateDevtmpfsFile(ctx context.Context, dev *devtmpfs.Accessor) error { + if err := dev.CreateDeviceFile(ctx, "fuse", vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, 0666 /* mode */); err != nil { + return err + } + + return nil +} diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index cd5f5049e..00e3c99cd 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -150,11 +150,9 @@ afterSymlink: return nil, err } if d != d.parent && !d.cachedMetadataAuthoritative() { - _, attrMask, attr, err := d.parent.file.getAttr(ctx, dentryAttrMask()) - if err != nil { + if err := d.parent.updateFromGetattr(ctx); err != nil { return nil, err } - d.parent.updateFromP9Attrs(attrMask, &attr) } rp.Advance() return d.parent, nil @@ -209,17 +207,28 @@ func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFil // Preconditions: As for getChildLocked. !parent.isSynthetic(). func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) { + if child != nil { + // Need to lock child.metadataMu because we might be updating child + // metadata. We need to hold the lock *before* getting metadata from the + // server and release it after updating local metadata. + child.metadataMu.Lock() + } qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name) if err != nil && err != syserror.ENOENT { + if child != nil { + child.metadataMu.Unlock() + } return nil, err } if child != nil { if !file.isNil() && inoFromPath(qid.Path) == child.ino { // The file at this path hasn't changed. Just update cached metadata. file.close(ctx) - child.updateFromP9Attrs(attrMask, &attr) + child.updateFromP9AttrsLocked(attrMask, &attr) + child.metadataMu.Unlock() return child, nil } + child.metadataMu.Unlock() if file.isNil() && child.isSynthetic() { // We have a synthetic file, and no remote file has arisen to // replace it. @@ -1325,7 +1334,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts fs.renameMuRUnlockAndCheckCaching(&ds) return err } - if err := d.setStat(ctx, rp.Credentials(), &opts.Stat, rp.Mount()); err != nil { + if err := d.setStat(ctx, rp.Credentials(), &opts, rp.Mount()); err != nil { fs.renameMuRUnlockAndCheckCaching(&ds) return err } diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index b74d489a0..e20de84b5 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -785,8 +785,8 @@ func (d *dentry) cachedMetadataAuthoritative() bool { // updateFromP9Attrs is called to update d's metadata after an update from the // remote filesystem. -func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { - d.metadataMu.Lock() +// Precondition: d.metadataMu must be locked. +func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) { if mask.Mode { if got, want := uint32(attr.Mode.FileType()), d.fileType(); got != want { d.metadataMu.Unlock() @@ -822,7 +822,6 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { if mask.Size { d.updateFileSizeLocked(attr.Size) } - d.metadataMu.Unlock() } // Preconditions: !d.isSynthetic() @@ -834,6 +833,10 @@ func (d *dentry) updateFromGetattr(ctx context.Context) error { file p9file handleMuRLocked bool ) + // d.metadataMu must be locked *before* we getAttr so that we do not end up + // updating stale attributes in d.updateFromP9AttrsLocked(). + d.metadataMu.Lock() + defer d.metadataMu.Unlock() d.handleMu.RLock() if !d.handle.file.isNil() { file = d.handle.file @@ -849,7 +852,7 @@ func (d *dentry) updateFromGetattr(ctx context.Context) error { if err != nil { return err } - d.updateFromP9Attrs(attrMask, &attr) + d.updateFromP9AttrsLocked(attrMask, &attr) return nil } @@ -885,7 +888,8 @@ func (d *dentry) statTo(stat *linux.Statx) { stat.DevMinor = d.fs.devMinor } -func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error { +func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions, mnt *vfs.Mount) error { + stat := &opts.Stat if stat.Mask == 0 { return nil } @@ -893,7 +897,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin return syserror.EPERM } mode := linux.FileMode(atomic.LoadUint32(&d.mode)) - if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { + if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { return err } if err := mnt.CheckBeginWrite(); err != nil { @@ -934,6 +938,17 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin } if !d.isSynthetic() { if stat.Mask != 0 { + if stat.Mask&linux.STATX_SIZE != 0 { + // Check whether to allow a truncate request to be made. + switch d.mode & linux.S_IFMT { + case linux.S_IFREG: + // Allow. + case linux.S_IFDIR: + return syserror.EISDIR + default: + return syserror.EINVAL + } + } if err := d.file.setAttr(ctx, p9.SetAttrMask{ Permissions: stat.Mask&linux.STATX_MODE != 0, UID: stat.Mask&linux.STATX_UID != 0, @@ -1495,7 +1510,7 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { - if err := fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts.Stat, fd.vfsfd.Mount()); err != nil { + if err := fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts, fd.vfsfd.Mount()); err != nil { return err } if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index f10350c97..09f142cfc 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -29,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" @@ -155,26 +154,53 @@ func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + n, _, err := fd.pwrite(ctx, src, offset, opts) + return n, err +} + +// pwrite returns the number of bytes written, final offset, error. The final +// offset should be ignored by PWrite. +func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { if offset < 0 { - return 0, syserror.EINVAL + return 0, offset, syserror.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { - return 0, syserror.EOPNOTSUPP + return 0, offset, syserror.EOPNOTSUPP + } + + d := fd.dentry() + // If the fd was opened with O_APPEND, make sure the file size is updated. + // There is a possible race here if size is modified externally after + // metadata cache is updated. + if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() { + if err := d.updateFromGetattr(ctx); err != nil { + return 0, offset, err + } } + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + // Set offset to file size if the fd was opened with O_APPEND. + if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { + // Holding d.metadataMu is sufficient for reading d.size. + offset = int64(d.size) + } limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) if err != nil { - return 0, err + return 0, offset, err } src = src.TakeFirst64(limit) + n, err := fd.pwriteLocked(ctx, src, offset, opts) + return n, offset + n, err +} +// Preconditions: fd.dentry().metatdataMu must be locked. +func (fd *regularFileFD) pwriteLocked(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { d := fd.dentry() - d.metadataMu.Lock() - defer d.metadataMu.Unlock() if d.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:__generic_file_write_iter() => // file_update_time(). This is d.touchCMtime(), but without locking @@ -194,12 +220,12 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off return 0, syserror.EINVAL } mr := memmap.MappableRange{pgstart, pgend} - var freed []platform.FileRange + var freed []memmap.FileRange d.dataMu.Lock() cseg := d.cache.LowerBoundSegment(mr.Start) for cseg.Ok() && cseg.Start() < mr.End { cseg = d.cache.Isolate(cseg, mr) - freed = append(freed, platform.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()}) + freed = append(freed, memmap.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()}) cseg = d.cache.Remove(cseg).NextSegment() } d.dataMu.Unlock() @@ -237,8 +263,8 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off // Write implements vfs.FileDescriptionImpl.Write. func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { fd.mu.Lock() - n, err := fd.PWrite(ctx, src, fd.off, opts) - fd.off += n + n, off, err := fd.pwrite(ctx, src, fd.off, opts) + fd.off = off fd.mu.Unlock() return n, err } @@ -794,7 +820,7 @@ func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (d *dentry) InvalidateUnsavable(ctx context.Context) error { - // Whether we have a host fd (and consequently what platform.File is + // Whether we have a host fd (and consequently what memmap.File is // mapped) can change across save/restore, so invalidate all translations // unconditionally. d.mapsMu.Lock() @@ -842,8 +868,8 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { } } -// dentryPlatformFile implements platform.File. It exists solely because dentry -// cannot implement both vfs.DentryImpl.IncRef and platform.File.IncRef. +// dentryPlatformFile implements memmap.File. It exists solely because dentry +// cannot implement both vfs.DentryImpl.IncRef and memmap.File.IncRef. // // dentryPlatformFile is only used when a host FD representing the remote file // is available (i.e. dentry.handle.fd >= 0), and that FD is used for @@ -851,7 +877,7 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { type dentryPlatformFile struct { *dentry - // fdRefs counts references on platform.File offsets. fdRefs is protected + // fdRefs counts references on memmap.File offsets. fdRefs is protected // by dentry.dataMu. fdRefs fsutil.FrameRefSet @@ -863,29 +889,29 @@ type dentryPlatformFile struct { hostFileMapperInitOnce sync.Once } -// IncRef implements platform.File.IncRef. -func (d *dentryPlatformFile) IncRef(fr platform.FileRange) { +// IncRef implements memmap.File.IncRef. +func (d *dentryPlatformFile) IncRef(fr memmap.FileRange) { d.dataMu.Lock() d.fdRefs.IncRefAndAccount(fr) d.dataMu.Unlock() } -// DecRef implements platform.File.DecRef. -func (d *dentryPlatformFile) DecRef(fr platform.FileRange) { +// DecRef implements memmap.File.DecRef. +func (d *dentryPlatformFile) DecRef(fr memmap.FileRange) { d.dataMu.Lock() d.fdRefs.DecRefAndAccount(fr) d.dataMu.Unlock() } -// MapInternal implements platform.File.MapInternal. -func (d *dentryPlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { +// MapInternal implements memmap.File.MapInternal. +func (d *dentryPlatformFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { d.handleMu.RLock() bs, err := d.hostFileMapper.MapInternal(fr, int(d.handle.fd), at.Write) d.handleMu.RUnlock() return bs, err } -// FD implements platform.File.FD. +// FD implements memmap.File.FD. func (d *dentryPlatformFile) FD() int { d.handleMu.RLock() fd := d.handle.fd diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index a7b53b2d2..811528982 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -16,6 +16,7 @@ package gofer import ( "sync" + "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -144,7 +145,7 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs // mmap due to lock ordering; MM locks precede dentry.dataMu. That doesn't // hold here since specialFileFD doesn't client-cache data. Just buffer the // read instead. - if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { + if d := fd.dentry(); d.cachedMetadataAuthoritative() { d.touchAtime(fd.vfsfd.Mount()) } buf := make([]byte, dst.NumBytes()) @@ -176,39 +177,76 @@ func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + n, _, err := fd.pwrite(ctx, src, offset, opts) + return n, err +} + +// pwrite returns the number of bytes written, final offset, error. The final +// offset should be ignored by PWrite. +func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { if fd.seekable && offset < 0 { - return 0, syserror.EINVAL + return 0, offset, syserror.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { - return 0, syserror.EOPNOTSUPP + return 0, offset, syserror.EOPNOTSUPP + } + + d := fd.dentry() + // If the regular file fd was opened with O_APPEND, make sure the file size + // is updated. There is a possible race here if size is modified externally + // after metadata cache is updated. + if fd.seekable && fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() { + if err := d.updateFromGetattr(ctx); err != nil { + return 0, offset, err + } } if fd.seekable { + // We need to hold the metadataMu *while* writing to a regular file. + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + + // Set offset to file size if the regular file was opened with O_APPEND. + if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { + // Holding d.metadataMu is sufficient for reading d.size. + offset = int64(d.size) + } limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) if err != nil { - return 0, err + return 0, offset, err } src = src.TakeFirst64(limit) } // Do a buffered write. See rationale in PRead. - if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { + if d.cachedMetadataAuthoritative() { d.touchCMtime() } buf := make([]byte, src.NumBytes()) // Don't do partial writes if we get a partial read from src. if _, err := src.CopyIn(ctx, buf); err != nil { - return 0, err + return 0, offset, err } n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) if err == syserror.EAGAIN { err = syserror.ErrWouldBlock } - return int64(n), err + finalOff = offset + // Update file size for regular files. + if fd.seekable { + finalOff += int64(n) + // d.metadataMu is already locked at this point. + if uint64(finalOff) > d.size { + d.dataMu.Lock() + defer d.dataMu.Unlock() + atomic.StoreUint64(&d.size, uint64(finalOff)) + } + } + return int64(n), finalOff, err } // Write implements vfs.FileDescriptionImpl.Write. @@ -218,8 +256,8 @@ func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts } fd.mu.Lock() - n, err := fd.PWrite(ctx, src, fd.off, opts) - fd.off += n + n, off, err := fd.pwrite(ctx, src, fd.off, opts) + fd.off = off fd.mu.Unlock() return n, err } diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index e86fbe2d5..bd701bbc7 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -34,7 +34,6 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", - "//pkg/sentry/platform", "//pkg/sentry/socket/control", "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 1a88cb657..c894f2ca0 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -373,7 +373,7 @@ func (i *inode) fstat(fs *filesystem) (linux.Statx, error) { // SetStat implements kernfs.Inode. func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { - s := opts.Stat + s := &opts.Stat m := s.Mask if m == 0 { @@ -386,7 +386,7 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre if err := syscall.Fstat(i.hostFD, &hostStat); err != nil { return err } - if err := vfs.CheckSetStat(ctx, creds, &s, linux.FileMode(hostStat.Mode&linux.PermissionsMask), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil { + if err := vfs.CheckSetStat(ctx, creds, &opts, linux.FileMode(hostStat.Mode), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil { return err } @@ -396,6 +396,9 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre } } if m&linux.STATX_SIZE != 0 { + if hostStat.Mode&linux.S_IFMT != linux.S_IFREG { + return syserror.EINVAL + } if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil { return err } diff --git a/pkg/sentry/fsimpl/host/mmap.go b/pkg/sentry/fsimpl/host/mmap.go index 8545a82f0..65d3af38c 100644 --- a/pkg/sentry/fsimpl/host/mmap.go +++ b/pkg/sentry/fsimpl/host/mmap.go @@ -19,13 +19,12 @@ import ( "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) -// inodePlatformFile implements platform.File. It exists solely because inode -// cannot implement both kernfs.Inode.IncRef and platform.File.IncRef. +// inodePlatformFile implements memmap.File. It exists solely because inode +// cannot implement both kernfs.Inode.IncRef and memmap.File.IncRef. // // inodePlatformFile should only be used if inode.canMap is true. type inodePlatformFile struct { @@ -34,7 +33,7 @@ type inodePlatformFile struct { // fdRefsMu protects fdRefs. fdRefsMu sync.Mutex - // fdRefs counts references on platform.File offsets. It is used solely for + // fdRefs counts references on memmap.File offsets. It is used solely for // memory accounting. fdRefs fsutil.FrameRefSet @@ -45,32 +44,32 @@ type inodePlatformFile struct { fileMapperInitOnce sync.Once } -// IncRef implements platform.File.IncRef. +// IncRef implements memmap.File.IncRef. // // Precondition: i.inode.canMap must be true. -func (i *inodePlatformFile) IncRef(fr platform.FileRange) { +func (i *inodePlatformFile) IncRef(fr memmap.FileRange) { i.fdRefsMu.Lock() i.fdRefs.IncRefAndAccount(fr) i.fdRefsMu.Unlock() } -// DecRef implements platform.File.DecRef. +// DecRef implements memmap.File.DecRef. // // Precondition: i.inode.canMap must be true. -func (i *inodePlatformFile) DecRef(fr platform.FileRange) { +func (i *inodePlatformFile) DecRef(fr memmap.FileRange) { i.fdRefsMu.Lock() i.fdRefs.DecRefAndAccount(fr) i.fdRefsMu.Unlock() } -// MapInternal implements platform.File.MapInternal. +// MapInternal implements memmap.File.MapInternal. // // Precondition: i.inode.canMap must be true. -func (i *inodePlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { +func (i *inodePlatformFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { return i.fileMapper.MapInternal(fr, i.hostFD, at.Write) } -// FD implements platform.File.FD. +// FD implements memmap.File.FD. func (i *inodePlatformFile) FD() int { return i.hostFD } diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 3f0aea73a..1d37ccb98 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -112,7 +112,7 @@ func (fd *GenericDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence return fd.DirectoryFileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts) } -// Release implements vfs.FileDecriptionImpl.Release. +// Release implements vfs.FileDescriptionImpl.Release. func (fd *GenericDirectoryFD) Release() {} func (fd *GenericDirectoryFD) filesystem() *vfs.Filesystem { @@ -123,7 +123,7 @@ func (fd *GenericDirectoryFD) inode() Inode { return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode } -// IterDirents implements vfs.FileDecriptionImpl.IterDirents. IterDirents holds +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. IterDirents holds // o.mu when calling cb. func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { fd.mu.Lock() @@ -198,7 +198,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent return err } -// Seek implements vfs.FileDecriptionImpl.Seek. +// Seek implements vfs.FileDescriptionImpl.Seek. func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { fd.mu.Lock() defer fd.mu.Unlock() diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 2ab3f1761..579e627f0 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -267,7 +267,7 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 { return syserror.EPERM } - if err := vfs.CheckSetStat(ctx, creds, &opts.Stat, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil { + if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil { return err } diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go index ff82e1f20..6b705e955 100644 --- a/pkg/sentry/fsimpl/overlay/filesystem.go +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -1104,7 +1104,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts } mode := linux.FileMode(atomic.LoadUint32(&d.mode)) - if err := vfs.CheckSetStat(ctx, rp.Credentials(), &opts.Stat, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { + if err := vfs.CheckSetStat(ctx, rp.Credentials(), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { return err } mnt := rp.Mount() diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go index a3c1f7a8d..c0749e711 100644 --- a/pkg/sentry/fsimpl/overlay/non_directory.go +++ b/pkg/sentry/fsimpl/overlay/non_directory.go @@ -151,7 +151,7 @@ func (fd *nonDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { d := fd.dentry() mode := linux.FileMode(atomic.LoadUint32(&d.mode)) - if err := vfs.CheckSetStat(ctx, auth.CredentialsFromContext(ctx), &opts.Stat, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { + if err := vfs.CheckSetStat(ctx, auth.CredentialsFromContext(ctx), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { return err } mnt := fd.vfsfd.Mount() @@ -176,7 +176,7 @@ func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) return nil } -// StatFS implements vfs.FileDesciptionImpl.StatFS. +// StatFS implements vfs.FileDescriptionImpl.StatFS. func (fd *nonDirectoryFD) StatFS(ctx context.Context) (linux.Statfs, error) { return fd.filesystem().statFS(ctx) } diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index dad4db1a7..79c2725f3 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -128,7 +128,7 @@ func (fd *subtasksFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallbac return fd.GenericDirectoryFD.IterDirents(ctx, cb) } -// Seek implements vfs.FileDecriptionImpl.Seek. +// Seek implements vfs.FileDescriptionImpl.Seek. func (fd *subtasksFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { if fd.task.ExitState() >= kernel.TaskExitZombie { return 0, syserror.ENOENT diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index a0f20c2d4..ef210a69b 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -649,7 +649,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts fs.mu.RUnlock() return err } - if err := d.inode.setStat(ctx, rp.Credentials(), &opts.Stat); err != nil { + if err := d.inode.setStat(ctx, rp.Credentials(), &opts); err != nil { fs.mu.RUnlock() return err } diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 1cdb46e6f..abbaa5d60 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -325,8 +325,15 @@ func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + n, _, err := fd.pwrite(ctx, src, offset, opts) + return n, err +} + +// pwrite returns the number of bytes written, final offset and error. The +// final offset should be ignored by PWrite. +func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { if offset < 0 { - return 0, syserror.EINVAL + return 0, offset, syserror.EINVAL } // Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since @@ -334,40 +341,44 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { - return 0, syserror.EOPNOTSUPP + return 0, offset, syserror.EOPNOTSUPP } srclen := src.NumBytes() if srclen == 0 { - return 0, nil + return 0, offset, nil } f := fd.inode().impl.(*regularFile) + f.inode.mu.Lock() + defer f.inode.mu.Unlock() + // If the file is opened with O_APPEND, update offset to file size. + if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { + // Locking f.inode.mu is sufficient for reading f.size. + offset = int64(f.size) + } if end := offset + srclen; end < offset { // Overflow. - return 0, syserror.EINVAL + return 0, offset, syserror.EINVAL } - var err error srclen, err = vfs.CheckLimit(ctx, offset, srclen) if err != nil { - return 0, err + return 0, offset, err } src = src.TakeFirst64(srclen) - f.inode.mu.Lock() rw := getRegularFileReadWriter(f, offset) n, err := src.CopyInTo(ctx, rw) - fd.inode().touchCMtimeLocked() - f.inode.mu.Unlock() + f.inode.touchCMtimeLocked() putRegularFileReadWriter(rw) - return n, err + return n, n + offset, err } // Write implements vfs.FileDescriptionImpl.Write. func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { fd.offMu.Lock() - n, err := fd.PWrite(ctx, src, fd.off, opts) - fd.off += n + n, off, err := fd.pwrite(ctx, src, fd.off, opts) + fd.off = off fd.offMu.Unlock() return n, err } diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index d7f4f0779..2545d88e9 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -452,7 +452,8 @@ func (i *inode) statTo(stat *linux.Statx) { } } -func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx) error { +func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions) error { + stat := &opts.Stat if stat.Mask == 0 { return nil } @@ -460,7 +461,7 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu return syserror.EPERM } mode := linux.FileMode(atomic.LoadUint32(&i.mode)) - if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { + if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { return err } i.mu.Lock() @@ -695,7 +696,7 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) d := fd.dentry() - if err := d.inode.setStat(ctx, creds, &opts.Stat); err != nil { + if err := d.inode.setStat(ctx, creds, &opts); err != nil { return err } diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 25fe1921b..f6886a758 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -132,6 +132,7 @@ go_library( "task_stop.go", "task_syscall.go", "task_usermem.go", + "task_work.go", "thread_group.go", "threads.go", "timekeeper.go", diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index 732e66da4..bcc1b29a8 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -717,10 +717,10 @@ func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint3 } } -// UnlockPI unlock the futex following the Priority-inheritance futex -// rules. The address provided must contain the caller's TID. If there are -// waiters, TID of the next waiter (FIFO) is set to the given address, and the -// waiter woken up. If there are no waiters, 0 is set to the address. +// UnlockPI unlocks the futex following the Priority-inheritance futex rules. +// The address provided must contain the caller's TID. If there are waiters, +// TID of the next waiter (FIFO) is set to the given address, and the waiter +// woken up. If there are no waiters, 0 is set to the address. func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool) error { k, err := getKey(t, addr, private) if err != nil { diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 240cd6fe0..15dae0f5b 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -1469,6 +1469,11 @@ func (k *Kernel) NowMonotonic() int64 { return now } +// AfterFunc implements tcpip.Clock.AfterFunc. +func (k *Kernel) AfterFunc(d time.Duration, f func()) tcpip.Timer { + return ktime.TcpipAfterFunc(k.realtimeClock, d, f) +} + // SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or // LoadFrom. func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) { diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index bfd779837..c211fc8d0 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -20,7 +20,6 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", - "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index f66cfcc7f..55b4c2cdb 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -45,7 +45,6 @@ import ( ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -370,7 +369,7 @@ type Shm struct { // fr is the offset into mfp.MemoryFile() that backs this contents of this // segment. Immutable. - fr platform.FileRange + fr memmap.FileRange // mu protects all fields below. mu sync.Mutex `state:"nosave"` diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index f48247c94..c4db05bd8 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -68,6 +68,21 @@ type Task struct { // runState is exclusive to the task goroutine. runState taskRunState + // taskWorkCount represents the current size of the task work queue. It is + // used to avoid acquiring taskWorkMu when the queue is empty. + // + // Must accessed with atomic memory operations. + taskWorkCount int32 + + // taskWorkMu protects taskWork. + taskWorkMu sync.Mutex `state:"nosave"` + + // taskWork is a queue of work to be executed before resuming user execution. + // It is similar to the task_work mechanism in Linux. + // + // taskWork is exclusive to the task goroutine. + taskWork []TaskWorker + // haveSyscallReturn is true if tc.Arch().Return() represents a value // returned by a syscall (or set by ptrace after a syscall). // @@ -550,6 +565,10 @@ type Task struct { // futexWaiter is exclusive to the task goroutine. futexWaiter *futex.Waiter `state:"nosave"` + // robustList is a pointer to the head of the tasks's robust futex + // list. + robustList usermem.Addr + // startTime is the real time at which the task started. It is set when // a Task is created or invokes execve(2). // diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 9b69f3cbe..7803b98d0 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -207,6 +207,9 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { return flags.CloseOnExec }) + // Handle the robust futex list. + t.exitRobustList() + // NOTE(b/30815691): We currently do not implement privileged // executables (set-user/group-ID bits and file capabilities). This // allows us to unconditionally enable user dumpability on the new mm. diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index c4ade6e8e..231ac548a 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -253,6 +253,9 @@ func (*runExitMain) execute(t *Task) taskRunState { } } + // Handle the robust futex list. + t.exitRobustList() + // Deactivate the address space and update max RSS before releasing the // task's MM. t.Deactivate() diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go index a53e77c9f..4b535c949 100644 --- a/pkg/sentry/kernel/task_futex.go +++ b/pkg/sentry/kernel/task_futex.go @@ -15,6 +15,7 @@ package kernel import ( + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/usermem" ) @@ -52,3 +53,127 @@ func (t *Task) LoadUint32(addr usermem.Addr) (uint32, error) { func (t *Task) GetSharedKey(addr usermem.Addr) (futex.Key, error) { return t.MemoryManager().GetSharedFutexKey(t, addr) } + +// GetRobustList sets the robust futex list for the task. +func (t *Task) GetRobustList() usermem.Addr { + t.mu.Lock() + addr := t.robustList + t.mu.Unlock() + return addr +} + +// SetRobustList sets the robust futex list for the task. +func (t *Task) SetRobustList(addr usermem.Addr) { + t.mu.Lock() + t.robustList = addr + t.mu.Unlock() +} + +// exitRobustList walks the robust futex list, marking locks dead and notifying +// wakers. It corresponds to Linux's exit_robust_list(). Following Linux, +// errors are silently ignored. +func (t *Task) exitRobustList() { + t.mu.Lock() + addr := t.robustList + t.robustList = 0 + t.mu.Unlock() + + if addr == 0 { + return + } + + var rl linux.RobustListHead + if _, err := rl.CopyIn(t, usermem.Addr(addr)); err != nil { + return + } + + next := rl.List + done := 0 + var pendingLockAddr usermem.Addr + if rl.ListOpPending != 0 { + pendingLockAddr = usermem.Addr(rl.ListOpPending + rl.FutexOffset) + } + + // Wake up normal elements. + for usermem.Addr(next) != addr { + // We traverse to the next element of the list before we + // actually wake anything. This prevents the race where waking + // this futex causes a modification of the list. + thisLockAddr := usermem.Addr(next + rl.FutexOffset) + + // Try to decode the next element in the list before waking the + // current futex. But don't check the error until after we've + // woken the current futex. Linux does it in this order too + _, nextErr := t.CopyIn(usermem.Addr(next), &next) + + // Wakeup the current futex if it's not pending. + if thisLockAddr != pendingLockAddr { + t.wakeRobustListOne(thisLockAddr) + } + + // If there was an error copying the next futex, we must bail. + if nextErr != nil { + break + } + + // This is a user structure, so it could be a massive list, or + // even contain a loop if they are trying to mess with us. We + // cap traversal to prevent that. + done++ + if done >= linux.ROBUST_LIST_LIMIT { + break + } + } + + // Is there a pending entry to wake? + if pendingLockAddr != 0 { + t.wakeRobustListOne(pendingLockAddr) + } +} + +// wakeRobustListOne wakes a single futex from the robust list. +func (t *Task) wakeRobustListOne(addr usermem.Addr) { + // Bit 0 in address signals PI futex. + pi := addr&1 == 1 + addr = addr &^ 1 + + // Load the futex. + f, err := t.LoadUint32(addr) + if err != nil { + // Can't read this single value? Ignore the problem. + // We can wake the other futexes in the list. + return + } + + tid := uint32(t.ThreadID()) + for { + // Is this held by someone else? + if f&linux.FUTEX_TID_MASK != tid { + return + } + + // This thread is dying and it's holding this futex. We need to + // set the owner died bit and wake up any waiters. + newF := (f & linux.FUTEX_WAITERS) | linux.FUTEX_OWNER_DIED + if curF, err := t.CompareAndSwapUint32(addr, f, newF); err != nil { + return + } else if curF != f { + // Futex changed out from under us. Try again... + f = curF + continue + } + + // Wake waiters if there are any. + if f&linux.FUTEX_WAITERS != 0 { + private := f&linux.FUTEX_PRIVATE_FLAG != 0 + if pi { + t.Futex().UnlockPI(t, addr, tid, private) + return + } + t.Futex().Wake(t, addr, private, linux.FUTEX_BITSET_MATCH_ANY, 1) + } + + // Done. + return + } +} diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index d654dd997..7d4f44caf 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -167,7 +167,22 @@ func (app *runApp) execute(t *Task) taskRunState { return (*runInterrupt)(nil) } - // We're about to switch to the application again. If there's still a + // Execute any task work callbacks before returning to user space. + if atomic.LoadInt32(&t.taskWorkCount) > 0 { + t.taskWorkMu.Lock() + queue := t.taskWork + t.taskWork = nil + atomic.StoreInt32(&t.taskWorkCount, 0) + t.taskWorkMu.Unlock() + + // Do not hold taskWorkMu while executing task work, which may register + // more work. + for _, work := range queue { + work.TaskWork(t) + } + } + + // We're about to switch to the application again. If there's still an // unhandled SyscallRestartErrno that wasn't translated to an EINTR, // restart the syscall that was interrupted. If there's a saved signal // mask, restore it. (Note that restoring the saved signal mask may unblock diff --git a/pkg/sentry/kernel/task_work.go b/pkg/sentry/kernel/task_work.go new file mode 100644 index 000000000..dda5a433a --- /dev/null +++ b/pkg/sentry/kernel/task_work.go @@ -0,0 +1,38 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import "sync/atomic" + +// TaskWorker is a deferred task. +// +// This must be savable. +type TaskWorker interface { + // TaskWork will be executed prior to returning to user space. Note that + // TaskWork may call RegisterWork again, but this will not be executed until + // the next return to user space, unlike in Linux. This effectively allows + // registration of indefinite user return hooks, but not by default. + TaskWork(t *Task) +} + +// RegisterWork can be used to register additional task work that will be +// performed prior to returning to user space. See TaskWorker.TaskWork for +// semantics regarding registration. +func (t *Task) RegisterWork(work TaskWorker) { + t.taskWorkMu.Lock() + defer t.taskWorkMu.Unlock() + atomic.AddInt32(&t.taskWorkCount, 1) + t.taskWork = append(t.taskWork, work) +} diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD index 7ba7dc50c..2817aa3ba 100644 --- a/pkg/sentry/kernel/time/BUILD +++ b/pkg/sentry/kernel/time/BUILD @@ -6,6 +6,7 @@ go_library( name = "time", srcs = [ "context.go", + "tcpip.go", "time.go", ], visibility = ["//pkg/sentry:internal"], diff --git a/pkg/sentry/kernel/time/tcpip.go b/pkg/sentry/kernel/time/tcpip.go new file mode 100644 index 000000000..c4474c0cf --- /dev/null +++ b/pkg/sentry/kernel/time/tcpip.go @@ -0,0 +1,131 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package time + +import ( + "sync" + "time" +) + +// TcpipAfterFunc waits for duration to elapse according to clock then runs fn. +// The timer is started immediately and will fire exactly once. +func TcpipAfterFunc(clock Clock, duration time.Duration, fn func()) *TcpipTimer { + timer := &TcpipTimer{ + clock: clock, + } + timer.notifier = functionNotifier{ + fn: func() { + // tcpip.Timer.Stop() explicitly states that the function is called in a + // separate goroutine that Stop() does not synchronize with. + // Timer.Destroy() synchronizes with calls to TimerListener.Notify(). + // This is semantically meaningful because, in the former case, it's + // legal to call tcpip.Timer.Stop() while holding locks that may also be + // taken by the function, but this isn't so in the latter case. Most + // immediately, Timer calls TimerListener.Notify() while holding + // Timer.mu. A deadlock occurs without spawning a goroutine: + // T1: (Timer expires) + // => Timer.Tick() <- Timer.mu.Lock() called + // => TimerListener.Notify() + // => Timer.Stop() + // => Timer.Destroy() <- Timer.mu.Lock() called, deadlock! + // + // Spawning a goroutine avoids the deadlock: + // T1: (Timer expires) + // => Timer.Tick() <- Timer.mu.Lock() called + // => TimerListener.Notify() <- Launches T2 + // T2: + // => Timer.Stop() + // => Timer.Destroy() <- Timer.mu.Lock() called, blocks + // T1: + // => (returns) <- Timer.mu.Unlock() called + // T2: + // => (continues) <- No deadlock! + go func() { + timer.Stop() + fn() + }() + }, + } + timer.Reset(duration) + return timer +} + +// TcpipTimer is a resettable timer with variable duration expirations. +// Implements tcpip.Timer, which does not define a Destroy method; instead, all +// resources are released after timer expiration and calls to Timer.Stop. +// +// Must be created by AfterFunc. +type TcpipTimer struct { + // clock is the time source. clock is immutable. + clock Clock + + // notifier is called when the Timer expires. notifier is immutable. + notifier functionNotifier + + // mu protects t. + mu sync.Mutex + + // t stores the latest running Timer. This is replaced whenever Reset is + // called since Timer cannot be restarted once it has been Destroyed by Stop. + // + // This field is nil iff Stop has been called. + t *Timer +} + +// Stop implements tcpip.Timer.Stop. +func (r *TcpipTimer) Stop() bool { + r.mu.Lock() + defer r.mu.Unlock() + + if r.t == nil { + return false + } + _, lastSetting := r.t.Swap(Setting{}) + r.t.Destroy() + r.t = nil + return lastSetting.Enabled +} + +// Reset implements tcpip.Timer.Reset. +func (r *TcpipTimer) Reset(d time.Duration) { + r.mu.Lock() + defer r.mu.Unlock() + + if r.t == nil { + r.t = NewTimer(r.clock, &r.notifier) + } + + r.t.Swap(Setting{ + Enabled: true, + Period: 0, + Next: r.clock.Now().Add(d), + }) +} + +// functionNotifier is a TimerListener that runs a function. +// +// functionNotifier cannot be saved or loaded. +type functionNotifier struct { + fn func() +} + +// Notify implements ktime.TimerListener.Notify. +func (f *functionNotifier) Notify(uint64, Setting) (Setting, bool) { + f.fn() + return Setting{}, false +} + +// Destroy implements ktime.TimerListener.Destroy. +func (f *functionNotifier) Destroy() {} diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index 5f3908d8b..7c4fefb16 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -21,8 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/log" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" sentrytime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sync" ) @@ -90,7 +90,7 @@ type Timekeeper struct { // NewTimekeeper does not take ownership of paramPage. // // SetClocks must be called on the returned Timekeeper before it is usable. -func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage platform.FileRange) (*Timekeeper, error) { +func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage memmap.FileRange) (*Timekeeper, error) { return &Timekeeper{ params: NewVDSOParamPage(mfp, paramPage), }, nil diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go index f1b3c212c..290c32466 100644 --- a/pkg/sentry/kernel/vdso.go +++ b/pkg/sentry/kernel/vdso.go @@ -19,8 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/usermem" ) @@ -58,7 +58,7 @@ type vdsoParams struct { type VDSOParamPage struct { // The parameter page is fr, allocated from mfp.MemoryFile(). mfp pgalloc.MemoryFileProvider - fr platform.FileRange + fr memmap.FileRange // seq is the current sequence count written to the page. // @@ -81,7 +81,7 @@ type VDSOParamPage struct { // * VDSOParamPage must be the only writer to fr. // // * mfp.MemoryFile().MapInternal(fr) must return a single safemem.Block. -func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *VDSOParamPage { +func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *VDSOParamPage { return &VDSOParamPage{mfp: mfp, fr: fr} } diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD index a98b66de1..2c95669cd 100644 --- a/pkg/sentry/memmap/BUILD +++ b/pkg/sentry/memmap/BUILD @@ -28,9 +28,21 @@ go_template_instance( }, ) +go_template_instance( + name = "file_range", + out = "file_range.go", + package = "memmap", + prefix = "File", + template = "//pkg/segment:generic_range", + types = { + "T": "uint64", + }, +) + go_library( name = "memmap", srcs = [ + "file_range.go", "mappable_range.go", "mapping_set.go", "mapping_set_impl.go", @@ -40,7 +52,7 @@ go_library( deps = [ "//pkg/context", "//pkg/log", - "//pkg/sentry/platform", + "//pkg/safemem", "//pkg/syserror", "//pkg/usermem", ], diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index c6db9fc8f..c188f6c29 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -19,12 +19,12 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/usermem" ) // Mappable represents a memory-mappable object, a mutable mapping from uint64 -// offsets to (platform.File, uint64 File offset) pairs. +// offsets to (File, uint64 File offset) pairs. // // See mm/mm.go for Mappable's place in the lock order. // @@ -74,7 +74,7 @@ type Mappable interface { // Translations are valid until invalidated by a callback to // MappingSpace.Invalidate or until the caller removes its mapping of the // translated range. Mappable implementations must ensure that at least one - // reference is held on all pages in a platform.File that may be the result + // reference is held on all pages in a File that may be the result // of a valid Translation. // // Preconditions: required.Length() > 0. optional.IsSupersetOf(required). @@ -100,7 +100,7 @@ type Translation struct { Source MappableRange // File is the mapped file. - File platform.File + File File // Offset is the offset into File at which this Translation begins. Offset uint64 @@ -110,9 +110,9 @@ type Translation struct { Perms usermem.AccessType } -// FileRange returns the platform.FileRange represented by t. -func (t Translation) FileRange() platform.FileRange { - return platform.FileRange{t.Offset, t.Offset + t.Source.Length()} +// FileRange returns the FileRange represented by t. +func (t Translation) FileRange() FileRange { + return FileRange{t.Offset, t.Offset + t.Source.Length()} } // CheckTranslateResult returns an error if (ts, terr) does not satisfy all @@ -361,3 +361,49 @@ type MMapOpts struct { // TODO(jamieliu): Replace entirely with MappingIdentity? Hint string } + +// File represents a host file that may be mapped into an platform.AddressSpace. +type File interface { + // All pages in a File are reference-counted. + + // IncRef increments the reference count on all pages in fr. + // + // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() > + // 0. At least one reference must be held on all pages in fr. (The File + // interface does not provide a way to acquire an initial reference; + // implementors may define mechanisms for doing so.) + IncRef(fr FileRange) + + // DecRef decrements the reference count on all pages in fr. + // + // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() > + // 0. At least one reference must be held on all pages in fr. + DecRef(fr FileRange) + + // MapInternal returns a mapping of the given file offsets in the invoking + // process' address space for reading and writing. + // + // Note that fr.Start and fr.End need not be page-aligned. + // + // Preconditions: fr.Length() > 0. At least one reference must be held on + // all pages in fr. + // + // Postconditions: The returned mapping is valid as long as at least one + // reference is held on the mapped pages. + MapInternal(fr FileRange, at usermem.AccessType) (safemem.BlockSeq, error) + + // FD returns the file descriptor represented by the File. + // + // The only permitted operation on the returned file descriptor is to map + // pages from it consistent with the requirements of AddressSpace.MapFile. + FD() int +} + +// FileRange represents a range of uint64 offsets into a File. +// +// type FileRange <generated using go_generics> + +// String implements fmt.Stringer.String. +func (fr FileRange) String() string { + return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End) +} diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index a036ce53c..f9d0837a1 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -7,14 +7,14 @@ go_template_instance( name = "file_refcount_set", out = "file_refcount_set.go", imports = { - "platform": "gvisor.dev/gvisor/pkg/sentry/platform", + "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap", }, package = "mm", prefix = "fileRefcount", template = "//pkg/segment:generic_set", types = { "Key": "uint64", - "Range": "platform.FileRange", + "Range": "memmap.FileRange", "Value": "int32", "Functions": "fileRefcountSetFunctions", }, diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 379148903..1999ec706 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -243,7 +242,7 @@ type aioMappable struct { refs.AtomicRefCount mfp pgalloc.MemoryFileProvider - fr platform.FileRange + fr memmap.FileRange } var aioRingBufferSize = uint64(usermem.Addr(linux.AIORingSize).MustRoundUp()) diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 6db7c3d40..3e85964e4 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -25,7 +25,7 @@ // Locks taken by memmap.Mappable.Translate // mm.privateRefs.mu // platform.AddressSpace locks -// platform.File locks +// memmap.File locks // mm.aioManager.mu // mm.AIOContext.mu // @@ -396,7 +396,7 @@ type pma struct { // file is the file mapped by this pma. Only pmas for which file == // MemoryManager.mfp.MemoryFile() may be saved. pmas hold a reference to // the corresponding file range while they exist. - file platform.File `state:"nosave"` + file memmap.File `state:"nosave"` // off is the offset into file at which this pma begins. // @@ -436,7 +436,7 @@ type pma struct { private bool // If internalMappings is not empty, it is the cached return value of - // file.MapInternal for the platform.FileRange mapped by this pma. + // file.MapInternal for the memmap.FileRange mapped by this pma. internalMappings safemem.BlockSeq `state:"nosave"` } @@ -469,10 +469,10 @@ func (fileRefcountSetFunctions) MaxKey() uint64 { func (fileRefcountSetFunctions) ClearValue(_ *int32) { } -func (fileRefcountSetFunctions) Merge(_ platform.FileRange, rc1 int32, _ platform.FileRange, rc2 int32) (int32, bool) { +func (fileRefcountSetFunctions) Merge(_ memmap.FileRange, rc1 int32, _ memmap.FileRange, rc2 int32) (int32, bool) { return rc1, rc1 == rc2 } -func (fileRefcountSetFunctions) Split(_ platform.FileRange, rc int32, _ uint64) (int32, int32) { +func (fileRefcountSetFunctions) Split(_ memmap.FileRange, rc int32, _ uint64) (int32, int32) { return rc, rc } diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go index 62e4c20af..930ec895f 100644 --- a/pkg/sentry/mm/pma.go +++ b/pkg/sentry/mm/pma.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/safecopy" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -604,7 +603,7 @@ func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivat } } -// Pin returns the platform.File ranges currently mapped by addresses in ar in +// Pin returns the memmap.File ranges currently mapped by addresses in ar in // mm, acquiring a reference on the returned ranges which the caller must // release by calling Unpin. If not all addresses are mapped, Pin returns a // non-nil error. Note that Pin may return both a non-empty slice of @@ -674,15 +673,15 @@ type PinnedRange struct { Source usermem.AddrRange // File is the mapped file. - File platform.File + File memmap.File // Offset is the offset into File at which this PinnedRange begins. Offset uint64 } -// FileRange returns the platform.File offsets mapped by pr. -func (pr PinnedRange) FileRange() platform.FileRange { - return platform.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())} +// FileRange returns the memmap.File offsets mapped by pr. +func (pr PinnedRange) FileRange() memmap.FileRange { + return memmap.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())} } // Unpin releases the reference held by prs. @@ -857,7 +856,7 @@ func (mm *MemoryManager) vecInternalMappingsLocked(ars usermem.AddrRangeSeq) saf } // incPrivateRef acquires a reference on private pages in fr. -func (mm *MemoryManager) incPrivateRef(fr platform.FileRange) { +func (mm *MemoryManager) incPrivateRef(fr memmap.FileRange) { mm.privateRefs.mu.Lock() defer mm.privateRefs.mu.Unlock() refSet := &mm.privateRefs.refs @@ -878,8 +877,8 @@ func (mm *MemoryManager) incPrivateRef(fr platform.FileRange) { } // decPrivateRef releases a reference on private pages in fr. -func (mm *MemoryManager) decPrivateRef(fr platform.FileRange) { - var freed []platform.FileRange +func (mm *MemoryManager) decPrivateRef(fr memmap.FileRange) { + var freed []memmap.FileRange mm.privateRefs.mu.Lock() refSet := &mm.privateRefs.refs @@ -951,7 +950,7 @@ func (pmaSetFunctions) Merge(ar1 usermem.AddrRange, pma1 pma, ar2 usermem.AddrRa // Discard internal mappings instead of trying to merge them, since merging // them requires an allocation and getting them again from the - // platform.File might not. + // memmap.File might not. pma1.internalMappings = safemem.BlockSeq{} return pma1, true } @@ -1012,12 +1011,12 @@ func (pseg pmaIterator) getInternalMappingsLocked() error { return nil } -func (pseg pmaIterator) fileRange() platform.FileRange { +func (pseg pmaIterator) fileRange() memmap.FileRange { return pseg.fileRangeOf(pseg.Range()) } // Preconditions: pseg.Range().IsSupersetOf(ar). ar.Length != 0. -func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) platform.FileRange { +func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) memmap.FileRange { if checkInvariants { if !pseg.Ok() { panic("terminal pma iterator") @@ -1032,5 +1031,5 @@ func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) platform.FileRange { pma := pseg.ValuePtr() pstart := pseg.Start() - return platform.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)} + return memmap.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)} } diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index 9ad52082d..0e142fb11 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -35,7 +34,7 @@ type SpecialMappable struct { refs.AtomicRefCount mfp pgalloc.MemoryFileProvider - fr platform.FileRange + fr memmap.FileRange name string } @@ -44,7 +43,7 @@ type SpecialMappable struct { // SpecialMappable will use the given name in /proc/[pid]/maps. // // Preconditions: fr.Length() != 0. -func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *SpecialMappable { +func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *SpecialMappable { m := SpecialMappable{mfp: mfp, fr: fr, name: name} m.EnableLeakCheck("mm.SpecialMappable") return &m @@ -126,7 +125,7 @@ func (m *SpecialMappable) MemoryFileProvider() pgalloc.MemoryFileProvider { // FileRange returns the offsets into MemoryFileProvider().MemoryFile() that // store the SpecialMappable's contents. -func (m *SpecialMappable) FileRange() platform.FileRange { +func (m *SpecialMappable) FileRange() memmap.FileRange { return m.fr } diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index e1fcb175f..7a3311a70 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -36,14 +36,14 @@ go_template_instance( "trackGaps": "1", }, imports = { - "platform": "gvisor.dev/gvisor/pkg/sentry/platform", + "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap", }, package = "pgalloc", prefix = "usage", template = "//pkg/segment:generic_set", types = { "Key": "uint64", - "Range": "platform.FileRange", + "Range": "memmap.FileRange", "Value": "usageInfo", "Functions": "usageSetFunctions", }, @@ -56,14 +56,14 @@ go_template_instance( "minDegree": "10", }, imports = { - "platform": "gvisor.dev/gvisor/pkg/sentry/platform", + "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap", }, package = "pgalloc", prefix = "reclaim", template = "//pkg/segment:generic_set", types = { "Key": "uint64", - "Range": "platform.FileRange", + "Range": "memmap.FileRange", "Value": "reclaimSetValue", "Functions": "reclaimSetFunctions", }, @@ -89,7 +89,7 @@ go_library( "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/hostmm", - "//pkg/sentry/platform", + "//pkg/sentry/memmap", "//pkg/sentry/usage", "//pkg/state", "//pkg/state/wire", diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index afab97c0a..3243d7214 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -33,14 +33,14 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/hostmm" - "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) -// MemoryFile is a platform.File whose pages may be allocated to arbitrary +// MemoryFile is a memmap.File whose pages may be allocated to arbitrary // users. type MemoryFile struct { // opts holds options passed to NewMemoryFile. opts is immutable. @@ -372,7 +372,7 @@ func (f *MemoryFile) Destroy() { // to Allocate. // // Preconditions: length must be page-aligned and non-zero. -func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.FileRange, error) { +func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (memmap.FileRange, error) { if length == 0 || length%usermem.PageSize != 0 { panic(fmt.Sprintf("invalid allocation length: %#x", length)) } @@ -390,7 +390,7 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi // Find a range in the underlying file. fr, ok := findAvailableRange(&f.usage, f.fileSize, length, alignment) if !ok { - return platform.FileRange{}, syserror.ENOMEM + return memmap.FileRange{}, syserror.ENOMEM } // Expand the file if needed. @@ -398,7 +398,7 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi // Round the new file size up to be chunk-aligned. newFileSize := (int64(fr.End) + chunkMask) &^ chunkMask if err := f.file.Truncate(newFileSize); err != nil { - return platform.FileRange{}, err + return memmap.FileRange{}, err } f.fileSize = newFileSize f.mappingsMu.Lock() @@ -416,7 +416,7 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi bs[i] = 0 } }); err != nil { - return platform.FileRange{}, err + return memmap.FileRange{}, err } } if !f.usage.Add(fr, usageInfo{ @@ -439,7 +439,7 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi // space for mappings to be allocated downwards. // // Precondition: alignment must be a power of 2. -func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint64) (platform.FileRange, bool) { +func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint64) (memmap.FileRange, bool) { alignmentMask := alignment - 1 // Search for space in existing gaps, starting at the current end of the @@ -461,7 +461,7 @@ func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint6 break } if start := unalignedStart &^ alignmentMask; start >= gap.Start() { - return platform.FileRange{start, start + length}, true + return memmap.FileRange{start, start + length}, true } gap = gap.PrevLargeEnoughGap(length) @@ -475,7 +475,7 @@ func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint6 min = (min + alignmentMask) &^ alignmentMask if min+length < min { // Overflow: allocation would exceed the range of uint64. - return platform.FileRange{}, false + return memmap.FileRange{}, false } // Determine the minimum file size required to fit this allocation at its end. @@ -484,7 +484,7 @@ func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint6 if newFileSize <= fileSize { if fileSize != 0 { // Overflow: allocation would exceed the range of int64. - return platform.FileRange{}, false + return memmap.FileRange{}, false } newFileSize = chunkSize } @@ -496,7 +496,7 @@ func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint6 continue } if start := unalignedStart &^ alignmentMask; start >= min { - return platform.FileRange{start, start + length}, true + return memmap.FileRange{start, start + length}, true } } } @@ -508,22 +508,22 @@ func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint6 // by r.ReadToBlocks(), it returns that error. // // Preconditions: length > 0. length must be page-aligned. -func (f *MemoryFile) AllocateAndFill(length uint64, kind usage.MemoryKind, r safemem.Reader) (platform.FileRange, error) { +func (f *MemoryFile) AllocateAndFill(length uint64, kind usage.MemoryKind, r safemem.Reader) (memmap.FileRange, error) { fr, err := f.Allocate(length, kind) if err != nil { - return platform.FileRange{}, err + return memmap.FileRange{}, err } dsts, err := f.MapInternal(fr, usermem.Write) if err != nil { f.DecRef(fr) - return platform.FileRange{}, err + return memmap.FileRange{}, err } n, err := safemem.ReadFullToBlocks(r, dsts) un := uint64(usermem.Addr(n).RoundDown()) if un < length { // Free unused memory and update fr to contain only the memory that is // still allocated. - f.DecRef(platform.FileRange{fr.Start + un, fr.End}) + f.DecRef(memmap.FileRange{fr.Start + un, fr.End}) fr.End = fr.Start + un } return fr, err @@ -540,7 +540,7 @@ const ( // will read zeroes. // // Preconditions: fr.Length() > 0. -func (f *MemoryFile) Decommit(fr platform.FileRange) error { +func (f *MemoryFile) Decommit(fr memmap.FileRange) error { if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { panic(fmt.Sprintf("invalid range: %v", fr)) } @@ -560,7 +560,7 @@ func (f *MemoryFile) Decommit(fr platform.FileRange) error { return nil } -func (f *MemoryFile) markDecommitted(fr platform.FileRange) { +func (f *MemoryFile) markDecommitted(fr memmap.FileRange) { f.mu.Lock() defer f.mu.Unlock() // Since we're changing the knownCommitted attribute, we need to merge @@ -581,8 +581,8 @@ func (f *MemoryFile) markDecommitted(fr platform.FileRange) { f.usage.MergeRange(fr) } -// IncRef implements platform.File.IncRef. -func (f *MemoryFile) IncRef(fr platform.FileRange) { +// IncRef implements memmap.File.IncRef. +func (f *MemoryFile) IncRef(fr memmap.FileRange) { if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { panic(fmt.Sprintf("invalid range: %v", fr)) } @@ -600,8 +600,8 @@ func (f *MemoryFile) IncRef(fr platform.FileRange) { f.usage.MergeAdjacent(fr) } -// DecRef implements platform.File.DecRef. -func (f *MemoryFile) DecRef(fr platform.FileRange) { +// DecRef implements memmap.File.DecRef. +func (f *MemoryFile) DecRef(fr memmap.FileRange) { if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { panic(fmt.Sprintf("invalid range: %v", fr)) } @@ -637,8 +637,8 @@ func (f *MemoryFile) DecRef(fr platform.FileRange) { } } -// MapInternal implements platform.File.MapInternal. -func (f *MemoryFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { +// MapInternal implements memmap.File.MapInternal. +func (f *MemoryFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { if !fr.WellFormed() || fr.Length() == 0 { panic(fmt.Sprintf("invalid range: %v", fr)) } @@ -664,7 +664,7 @@ func (f *MemoryFile) MapInternal(fr platform.FileRange, at usermem.AccessType) ( // forEachMappingSlice invokes fn on a sequence of byte slices that // collectively map all bytes in fr. -func (f *MemoryFile) forEachMappingSlice(fr platform.FileRange, fn func([]byte)) error { +func (f *MemoryFile) forEachMappingSlice(fr memmap.FileRange, fn func([]byte)) error { mappings := f.mappings.Load().([]uintptr) for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize { chunk := int(chunkStart >> chunkShift) @@ -944,7 +944,7 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func( continue case !populated && populatedRun: // Finish the run by changing this segment. - runRange := platform.FileRange{ + runRange := memmap.FileRange{ Start: r.Start + uint64(populatedRunStart*usermem.PageSize), End: r.Start + uint64(i*usermem.PageSize), } @@ -1009,7 +1009,7 @@ func (f *MemoryFile) File() *os.File { return f.file } -// FD implements platform.File.FD. +// FD implements memmap.File.FD. func (f *MemoryFile) FD() int { return int(f.file.Fd()) } @@ -1090,13 +1090,13 @@ func (f *MemoryFile) runReclaim() { // // Note that there returned range will be removed from tracking. It // must be reclaimed (removed from f.usage) at this point. -func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { +func (f *MemoryFile) findReclaimable() (memmap.FileRange, bool) { f.mu.Lock() defer f.mu.Unlock() for { for { if f.destroyed { - return platform.FileRange{}, false + return memmap.FileRange{}, false } if f.reclaimable { break @@ -1120,7 +1120,7 @@ func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { } } -func (f *MemoryFile) markReclaimed(fr platform.FileRange) { +func (f *MemoryFile) markReclaimed(fr memmap.FileRange) { f.mu.Lock() defer f.mu.Unlock() seg := f.usage.FindSegment(fr.Start) @@ -1222,11 +1222,11 @@ func (usageSetFunctions) MaxKey() uint64 { func (usageSetFunctions) ClearValue(val *usageInfo) { } -func (usageSetFunctions) Merge(_ platform.FileRange, val1 usageInfo, _ platform.FileRange, val2 usageInfo) (usageInfo, bool) { +func (usageSetFunctions) Merge(_ memmap.FileRange, val1 usageInfo, _ memmap.FileRange, val2 usageInfo) (usageInfo, bool) { return val1, val1 == val2 } -func (usageSetFunctions) Split(_ platform.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) { +func (usageSetFunctions) Split(_ memmap.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) { return val, val } @@ -1270,10 +1270,10 @@ func (reclaimSetFunctions) MaxKey() uint64 { func (reclaimSetFunctions) ClearValue(val *reclaimSetValue) { } -func (reclaimSetFunctions) Merge(_ platform.FileRange, _ reclaimSetValue, _ platform.FileRange, _ reclaimSetValue) (reclaimSetValue, bool) { +func (reclaimSetFunctions) Merge(_ memmap.FileRange, _ reclaimSetValue, _ memmap.FileRange, _ reclaimSetValue) (reclaimSetValue, bool) { return reclaimSetValue{}, true } -func (reclaimSetFunctions) Split(_ platform.FileRange, _ reclaimSetValue, _ uint64) (reclaimSetValue, reclaimSetValue) { +func (reclaimSetFunctions) Split(_ memmap.FileRange, _ reclaimSetValue, _ uint64) (reclaimSetValue, reclaimSetValue) { return reclaimSetValue{}, reclaimSetValue{} } diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD index 453241eca..209b28053 100644 --- a/pkg/sentry/platform/BUILD +++ b/pkg/sentry/platform/BUILD @@ -1,39 +1,21 @@ load("//tools:defs.bzl", "go_library") -load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) -go_template_instance( - name = "file_range", - out = "file_range.go", - package = "platform", - prefix = "File", - template = "//pkg/segment:generic_range", - types = { - "T": "uint64", - }, -) - go_library( name = "platform", srcs = [ "context.go", - "file_range.go", "mmap_min_addr.go", "platform.go", ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/atomicbitops", "//pkg/context", - "//pkg/log", - "//pkg/safecopy", - "//pkg/safemem", "//pkg/seccomp", "//pkg/sentry/arch", - "//pkg/sentry/usage", - "//pkg/syserror", + "//pkg/sentry/memmap", "//pkg/usermem", ], ) diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 10a10bfe2..b5d27a72a 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -47,6 +47,7 @@ go_library( "//pkg/safecopy", "//pkg/seccomp", "//pkg/sentry/arch", + "//pkg/sentry/memmap", "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", "//pkg/sentry/platform/ring0", diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index faf1d5e1c..98a3e539d 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -18,6 +18,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/atomicbitops" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.dev/gvisor/pkg/sync" @@ -150,7 +151,7 @@ func (as *addressSpace) mapLocked(addr usermem.Addr, m hostMapEntry, at usermem. } // MapFile implements platform.AddressSpace.MapFile. -func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.FileRange, at usermem.AccessType, precommit bool) error { +func (as *addressSpace) MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error { as.mu.Lock() defer as.mu.Unlock() diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index 171513f3f..4b13eec30 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -22,9 +22,9 @@ import ( "os" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/usermem" ) @@ -207,7 +207,7 @@ type AddressSpace interface { // Preconditions: addr and fr must be page-aligned. fr.Length() > 0. // at.Any() == true. At least one reference must be held on all pages in // fr, and must continue to be held as long as pages are mapped. - MapFile(addr usermem.Addr, f File, fr FileRange, at usermem.AccessType, precommit bool) error + MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error // Unmap unmaps the given range. // @@ -310,52 +310,6 @@ func (f SegmentationFault) Error() string { return fmt.Sprintf("segmentation fault at %#x", f.Addr) } -// File represents a host file that may be mapped into an AddressSpace. -type File interface { - // All pages in a File are reference-counted. - - // IncRef increments the reference count on all pages in fr. - // - // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() > - // 0. At least one reference must be held on all pages in fr. (The File - // interface does not provide a way to acquire an initial reference; - // implementors may define mechanisms for doing so.) - IncRef(fr FileRange) - - // DecRef decrements the reference count on all pages in fr. - // - // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() > - // 0. At least one reference must be held on all pages in fr. - DecRef(fr FileRange) - - // MapInternal returns a mapping of the given file offsets in the invoking - // process' address space for reading and writing. - // - // Note that fr.Start and fr.End need not be page-aligned. - // - // Preconditions: fr.Length() > 0. At least one reference must be held on - // all pages in fr. - // - // Postconditions: The returned mapping is valid as long as at least one - // reference is held on the mapped pages. - MapInternal(fr FileRange, at usermem.AccessType) (safemem.BlockSeq, error) - - // FD returns the file descriptor represented by the File. - // - // The only permitted operation on the returned file descriptor is to map - // pages from it consistent with the requirements of AddressSpace.MapFile. - FD() int -} - -// FileRange represents a range of uint64 offsets into a File. -// -// type FileRange <generated using go_generics> - -// String implements fmt.Stringer.String. -func (fr FileRange) String() string { - return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End) -} - // Requirements is used to specify platform specific requirements. type Requirements struct { // RequiresCurrentPIDNS indicates that the sandbox has to be started in the diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index 30402c2df..29fd23cc3 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -30,6 +30,7 @@ go_library( "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/hostcpu", + "//pkg/sentry/memmap", "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", "//pkg/sync", diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 2389423b0..c990f3454 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/procid" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" @@ -616,7 +617,7 @@ func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintp } // MapFile implements platform.AddressSpace.MapFile. -func (s *subprocess) MapFile(addr usermem.Addr, f platform.File, fr platform.FileRange, at usermem.AccessType, precommit bool) error { +func (s *subprocess) MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error { var flags int if precommit { flags |= syscall.MAP_POPULATE diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go index ccacaea6b..fca3a5478 100644 --- a/pkg/sentry/platform/ring0/kernel_arm64.go +++ b/pkg/sentry/platform/ring0/kernel_arm64.go @@ -58,7 +58,13 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { regs.Pstate &= ^uint64(UserFlagsClear) regs.Pstate |= UserFlagsSet + + SetTLS(regs.TPIDR_EL0) + kernelExitToEl0() + + regs.TPIDR_EL0 = GetTLS() + vector = c.vecCode // Perform the switch. diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD index c40c6d673..c0fd3425b 100644 --- a/pkg/sentry/socket/BUILD +++ b/pkg/sentry/socket/BUILD @@ -20,5 +20,6 @@ go_library( "//pkg/syserr", "//pkg/tcpip", "//pkg/usermem", + "//tools/go_marshal/marshal", ], ) diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index ff81ea6e6..e76e498de 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -40,6 +40,8 @@ go_library( "//pkg/tcpip/stack", "//pkg/usermem", "//pkg/waiter", + "//tools/go_marshal/marshal", + "//tools/go_marshal/primitive", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index a92aed2c9..532a1ea5d 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -36,6 +36,8 @@ import ( "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" + "gvisor.dev/gvisor/tools/go_marshal/marshal" + "gvisor.dev/gvisor/tools/go_marshal/primitive" ) const ( @@ -319,7 +321,7 @@ func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { } // GetSockOpt implements socket.Socket.GetSockOpt. -func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { +func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { if outLen < 0 { return nil, syserr.ErrInvalidArgument } @@ -364,7 +366,8 @@ func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr if err != nil { return nil, syserr.FromError(err) } - return opt, nil + optP := primitive.ByteSlice(opt) + return &optP, nil } // SetSockOpt implements socket.Socket.SetSockOpt. @@ -708,6 +711,6 @@ func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int func init() { for _, family := range []int{syscall.AF_INET, syscall.AF_INET6} { socket.RegisterProvider(family, &socketProvider{family}) - socket.RegisterProviderVFS2(family, &socketProviderVFS2{}) + socket.RegisterProviderVFS2(family, &socketProviderVFS2{family}) } } diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index f7abe77d3..a9f0604ae 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -66,7 +66,7 @@ func nflog(format string, args ...interface{}) { func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPTGetinfo, *syserr.Error) { // Read in the struct and table name. var info linux.IPTGetinfo - if _, err := t.CopyIn(outPtr, &info); err != nil { + if _, err := info.CopyIn(t, outPtr); err != nil { return linux.IPTGetinfo{}, syserr.FromError(err) } @@ -84,7 +84,7 @@ func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPT func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) { // Read in the struct and table name. var userEntries linux.IPTGetEntries - if _, err := t.CopyIn(outPtr, &userEntries); err != nil { + if _, err := userEntries.CopyIn(t, outPtr); err != nil { nflog("couldn't copy in entries %q", userEntries.Name) return linux.KernelIPTGetEntries{}, syserr.FromError(err) } @@ -145,7 +145,7 @@ func convertNetstackToBinary(stack *stack.Stack, tablename linux.TableName) (lin // Each rule corresponds to an entry. entry := linux.KernelIPTEntry{ - IPTEntry: linux.IPTEntry{ + Entry: linux.IPTEntry{ IP: linux.IPTIP{ Protocol: uint16(rule.Filter.Protocol), }, @@ -153,20 +153,20 @@ func convertNetstackToBinary(stack *stack.Stack, tablename linux.TableName) (lin TargetOffset: linux.SizeOfIPTEntry, }, } - copy(entry.IPTEntry.IP.Dst[:], rule.Filter.Dst) - copy(entry.IPTEntry.IP.DstMask[:], rule.Filter.DstMask) - copy(entry.IPTEntry.IP.Src[:], rule.Filter.Src) - copy(entry.IPTEntry.IP.SrcMask[:], rule.Filter.SrcMask) - copy(entry.IPTEntry.IP.OutputInterface[:], rule.Filter.OutputInterface) - copy(entry.IPTEntry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask) + copy(entry.Entry.IP.Dst[:], rule.Filter.Dst) + copy(entry.Entry.IP.DstMask[:], rule.Filter.DstMask) + copy(entry.Entry.IP.Src[:], rule.Filter.Src) + copy(entry.Entry.IP.SrcMask[:], rule.Filter.SrcMask) + copy(entry.Entry.IP.OutputInterface[:], rule.Filter.OutputInterface) + copy(entry.Entry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask) if rule.Filter.DstInvert { - entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_DSTIP + entry.Entry.IP.InverseFlags |= linux.IPT_INV_DSTIP } if rule.Filter.SrcInvert { - entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_SRCIP + entry.Entry.IP.InverseFlags |= linux.IPT_INV_SRCIP } if rule.Filter.OutputInterfaceInvert { - entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT + entry.Entry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT } for _, matcher := range rule.Matchers { @@ -178,8 +178,8 @@ func convertNetstackToBinary(stack *stack.Stack, tablename linux.TableName) (lin panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher)) } entry.Elems = append(entry.Elems, serialized...) - entry.NextOffset += uint16(len(serialized)) - entry.TargetOffset += uint16(len(serialized)) + entry.Entry.NextOffset += uint16(len(serialized)) + entry.Entry.TargetOffset += uint16(len(serialized)) } // Serialize and append the target. @@ -188,11 +188,11 @@ func convertNetstackToBinary(stack *stack.Stack, tablename linux.TableName) (lin panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target)) } entry.Elems = append(entry.Elems, serialized...) - entry.NextOffset += uint16(len(serialized)) + entry.Entry.NextOffset += uint16(len(serialized)) nflog("convert to binary: adding entry: %+v", entry) - entries.Size += uint32(entry.NextOffset) + entries.Size += uint32(entry.Entry.NextOffset) entries.Entrytable = append(entries.Entrytable, entry) info.NumEntries++ } @@ -342,10 +342,10 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { // TODO(gvisor.dev/issue/170): Support other tables. var table stack.Table switch replace.Name.String() { - case stack.TablenameFilter: + case stack.FilterTable: table = stack.EmptyFilterTable() - case stack.TablenameNat: - table = stack.EmptyNatTable() + case stack.NATTable: + table = stack.EmptyNATTable() default: nflog("we don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String()) return syserr.ErrInvalidArgument @@ -431,6 +431,8 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { for hook, _ := range replace.HookEntry { if table.ValidHooks()&(1<<hook) != 0 { hk := hookFromLinux(hook) + table.BuiltinChains[hk] = stack.HookUnset + table.Underflows[hk] = stack.HookUnset for offset, ruleIdx := range offsets { if offset == replace.HookEntry[hook] { table.BuiltinChains[hk] = ruleIdx @@ -456,8 +458,7 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { // Add the user chains. for ruleIdx, rule := range table.Rules { - target, ok := rule.Target.(stack.UserChainTarget) - if !ok { + if _, ok := rule.Target.(stack.UserChainTarget); !ok { continue } @@ -473,7 +474,6 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { nflog("user chain's first node must have no matchers") return syserr.ErrInvalidArgument } - table.UserChains[target.Name] = ruleIdx + 1 } // Set each jump to point to the appropriate rule. Right now they hold byte @@ -499,7 +499,10 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { // Since we only support modifying the INPUT, PREROUTING and OUTPUT chain right now, // make sure all other chains point to ACCEPT rules. for hook, ruleIdx := range table.BuiltinChains { - if hook == stack.Forward || hook == stack.Postrouting { + if hook := stack.Hook(hook); hook == stack.Forward || hook == stack.Postrouting { + if ruleIdx == stack.HookUnset { + continue + } if !isUnconditionalAccept(table.Rules[ruleIdx]) { nflog("hook %d is unsupported.", hook) return syserr.ErrInvalidArgument @@ -512,9 +515,7 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { // - There are no chains without an unconditional final rule. // - There are no chains without an unconditional underflow rule. - stk.IPTables().ReplaceTable(replace.Name.String(), table) - - return nil + return syserr.TranslateNetstackError(stk.IPTables().ReplaceTable(replace.Name.String(), table)) } // parseMatchers parses 0 or more matchers from optVal. optVal should contain diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index d5ca3ac56..0546801bf 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -36,6 +36,8 @@ go_library( "//pkg/tcpip", "//pkg/usermem", "//pkg/waiter", + "//tools/go_marshal/marshal", + "//tools/go_marshal/primitive", ], ) diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 81f34c5a2..98ca7add0 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -38,6 +38,8 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" + "gvisor.dev/gvisor/tools/go_marshal/marshal" + "gvisor.dev/gvisor/tools/go_marshal/primitive" ) const sizeOfInt32 int = 4 @@ -330,7 +332,7 @@ func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { } // GetSockOpt implements socket.Socket.GetSockOpt. -func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { +func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { switch level { case linux.SOL_SOCKET: switch name { @@ -340,24 +342,26 @@ func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr } s.mu.Lock() defer s.mu.Unlock() - return int32(s.sendBufferSize), nil + sendBufferSizeP := primitive.Int32(s.sendBufferSize) + return &sendBufferSizeP, nil case linux.SO_RCVBUF: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } // We don't have limit on receiving size. - return int32(math.MaxInt32), nil + recvBufferSizeP := primitive.Int32(math.MaxInt32) + return &recvBufferSizeP, nil case linux.SO_PASSCRED: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - var passcred int32 + var passcred primitive.Int32 if s.Passcred() { passcred = 1 } - return passcred, nil + return &passcred, nil default: socket.GetSockOptEmitUnimplementedEvent(t, name) diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index ea6ebd0e2..1fb777a6c 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -51,6 +51,8 @@ go_library( "//pkg/tcpip/transport/udp", "//pkg/usermem", "//pkg/waiter", + "//tools/go_marshal/marshal", + "//tools/go_marshal/primitive", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 49a04e613..44b3fff46 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -26,6 +26,7 @@ package netstack import ( "bytes" + "fmt" "io" "math" "reflect" @@ -61,6 +62,8 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" + "gvisor.dev/gvisor/tools/go_marshal/marshal" + "gvisor.dev/gvisor/tools/go_marshal/primitive" ) func mustCreateMetric(name, description string) *tcpip.StatCounter { @@ -909,7 +912,7 @@ func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { +func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is // implemented specifically for netstack.SocketOperations rather than // commonEndpoint. commonEndpoint should be extended to support socket @@ -919,25 +922,25 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - val := int32(0) + val := primitive.Int32(0) s.readMu.Lock() defer s.readMu.Unlock() if s.sockOptTimestamp { val = 1 } - return val, nil + return &val, nil } if level == linux.SOL_TCP && name == linux.TCP_INQ { if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - val := int32(0) + val := primitive.Int32(0) s.readMu.Lock() defer s.readMu.Unlock() if s.sockOptInq { val = 1 } - return val, nil + return &val, nil } if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP { @@ -955,7 +958,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us if err != nil { return nil, err } - return info, nil + return &info, nil case linux.IPT_SO_GET_ENTRIES: if outLen < linux.SizeOfIPTGetEntries { @@ -970,7 +973,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us if err != nil { return nil, err } - return entries, nil + return &entries, nil } } @@ -980,7 +983,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us // GetSockOpt can be used to implement the linux syscall getsockopt(2) for // sockets backed by a commonEndpoint. -func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (interface{}, *syserr.Error) { +func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (marshal.Marshallable, *syserr.Error) { switch level { case linux.SOL_SOCKET: return getSockOptSocket(t, s, ep, family, skType, name, outLen) @@ -1013,7 +1016,7 @@ func boolToInt32(v bool) int32 { } // getSockOptSocket implements GetSockOpt when level is SOL_SOCKET. -func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) { +func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (marshal.Marshallable, *syserr.Error) { // TODO(b/124056281): Stop rejecting short optLen values in getsockopt. switch name { case linux.SO_ERROR: @@ -1024,9 +1027,12 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam // Get the last error and convert it. err := ep.GetSockOpt(tcpip.ErrorOption{}) if err == nil { - return int32(0), nil + optP := primitive.Int32(0) + return &optP, nil } - return int32(syserr.TranslateNetstackError(err).ToLinux().Number()), nil + + optP := primitive.Int32(syserr.TranslateNetstackError(err).ToLinux().Number()) + return &optP, nil case linux.SO_PEERCRED: if family != linux.AF_UNIX || outLen < syscall.SizeofUcred { @@ -1034,11 +1040,12 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam } tcred := t.Credentials() - return syscall.Ucred{ - Pid: int32(t.ThreadGroup().ID()), - Uid: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()), - Gid: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()), - }, nil + creds := linux.ControlMessageCredentials{ + PID: int32(t.ThreadGroup().ID()), + UID: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()), + GID: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()), + } + return &creds, nil case linux.SO_PASSCRED: if outLen < sizeOfInt32 { @@ -1049,7 +1056,9 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil case linux.SO_SNDBUF: if outLen < sizeOfInt32 { @@ -1065,7 +1074,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam size = math.MaxInt32 } - return int32(size), nil + sizeP := primitive.Int32(size) + return &sizeP, nil case linux.SO_RCVBUF: if outLen < sizeOfInt32 { @@ -1081,7 +1091,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam size = math.MaxInt32 } - return int32(size), nil + sizeP := primitive.Int32(size) + return &sizeP, nil case linux.SO_REUSEADDR: if outLen < sizeOfInt32 { @@ -1092,7 +1103,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil case linux.SO_REUSEPORT: if outLen < sizeOfInt32 { @@ -1103,7 +1115,9 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil case linux.SO_BINDTODEVICE: var v tcpip.BindToDeviceOption @@ -1111,7 +1125,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam return nil, syserr.TranslateNetstackError(err) } if v == 0 { - return []byte{}, nil + var b primitive.ByteSlice + return &b, nil } if outLen < linux.IFNAMSIZ { return nil, syserr.ErrInvalidArgument @@ -1126,7 +1141,9 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam // interface was removed. return nil, syserr.ErrUnknownDevice } - return append([]byte(nic.Name), 0), nil + + name := primitive.ByteSlice(append([]byte(nic.Name), 0)) + return &name, nil case linux.SO_BROADCAST: if outLen < sizeOfInt32 { @@ -1137,7 +1154,9 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil case linux.SO_KEEPALIVE: if outLen < sizeOfInt32 { @@ -1148,13 +1167,17 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil case linux.SO_LINGER: if outLen < linux.SizeOfLinger { return nil, syserr.ErrInvalidArgument } - return linux.Linger{}, nil + + linger := linux.Linger{} + return &linger, nil case linux.SO_SNDTIMEO: // TODO(igudger): Linux allows shorter lengths for partial results. @@ -1162,7 +1185,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam return nil, syserr.ErrInvalidArgument } - return linux.NsecToTimeval(s.SendTimeout()), nil + sendTimeout := linux.NsecToTimeval(s.SendTimeout()) + return &sendTimeout, nil case linux.SO_RCVTIMEO: // TODO(igudger): Linux allows shorter lengths for partial results. @@ -1170,7 +1194,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam return nil, syserr.ErrInvalidArgument } - return linux.NsecToTimeval(s.RecvTimeout()), nil + recvTimeout := linux.NsecToTimeval(s.RecvTimeout()) + return &recvTimeout, nil case linux.SO_OOBINLINE: if outLen < sizeOfInt32 { @@ -1182,7 +1207,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam return nil, syserr.TranslateNetstackError(err) } - return int32(v), nil + vP := primitive.Int32(v) + return &vP, nil case linux.SO_NO_CHECK: if outLen < sizeOfInt32 { @@ -1193,7 +1219,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil default: socket.GetSockOptEmitUnimplementedEvent(t, name) @@ -1202,7 +1229,7 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam } // getSockOptTCP implements GetSockOpt when level is SOL_TCP. -func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interface{}, *syserr.Error) { +func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) { switch name { case linux.TCP_NODELAY: if outLen < sizeOfInt32 { @@ -1213,7 +1240,9 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(!v), nil + + vP := primitive.Int32(boolToInt32(!v)) + return &vP, nil case linux.TCP_CORK: if outLen < sizeOfInt32 { @@ -1224,7 +1253,9 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil case linux.TCP_QUICKACK: if outLen < sizeOfInt32 { @@ -1235,7 +1266,9 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil case linux.TCP_MAXSEG: if outLen < sizeOfInt32 { @@ -1246,8 +1279,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa if err != nil { return nil, syserr.TranslateNetstackError(err) } - - return int32(v), nil + vP := primitive.Int32(v) + return &vP, nil case linux.TCP_KEEPIDLE: if outLen < sizeOfInt32 { @@ -1258,8 +1291,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } - - return int32(time.Duration(v) / time.Second), nil + keepAliveIdle := primitive.Int32(time.Duration(v) / time.Second) + return &keepAliveIdle, nil case linux.TCP_KEEPINTVL: if outLen < sizeOfInt32 { @@ -1270,8 +1303,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } - - return int32(time.Duration(v) / time.Second), nil + keepAliveInterval := primitive.Int32(time.Duration(v) / time.Second) + return &keepAliveInterval, nil case linux.TCP_KEEPCNT: if outLen < sizeOfInt32 { @@ -1282,8 +1315,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa if err != nil { return nil, syserr.TranslateNetstackError(err) } - - return int32(v), nil + vP := primitive.Int32(v) + return &vP, nil case linux.TCP_USER_TIMEOUT: if outLen < sizeOfInt32 { @@ -1294,8 +1327,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } - - return int32(time.Duration(v) / time.Millisecond), nil + tcpUserTimeout := primitive.Int32(time.Duration(v) / time.Millisecond) + return &tcpUserTimeout, nil case linux.TCP_INFO: var v tcpip.TCPInfoOption @@ -1308,12 +1341,13 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa info := linux.TCPInfo{} // Linux truncates the output binary to outLen. - ib := binary.Marshal(nil, usermem.ByteOrder, &info) - if len(ib) > outLen { - ib = ib[:outLen] + buf := t.CopyScratchBuffer(info.SizeBytes()) + info.MarshalUnsafe(buf) + if len(buf) > outLen { + buf = buf[:outLen] } - - return ib, nil + bufP := primitive.ByteSlice(buf) + return &bufP, nil case linux.TCP_CC_INFO, linux.TCP_NOTSENT_LOWAT, @@ -1343,7 +1377,9 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa } b := make([]byte, toCopy) copy(b, v) - return b, nil + + bP := primitive.ByteSlice(b) + return &bP, nil case linux.TCP_LINGER2: if outLen < sizeOfInt32 { @@ -1355,7 +1391,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa return nil, syserr.TranslateNetstackError(err) } - return int32(time.Duration(v) / time.Second), nil + lingerTimeout := primitive.Int32(time.Duration(v) / time.Second) + return &lingerTimeout, nil case linux.TCP_DEFER_ACCEPT: if outLen < sizeOfInt32 { @@ -1367,7 +1404,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa return nil, syserr.TranslateNetstackError(err) } - return int32(time.Duration(v) / time.Second), nil + tcpDeferAccept := primitive.Int32(time.Duration(v) / time.Second) + return &tcpDeferAccept, nil case linux.TCP_SYNCNT: if outLen < sizeOfInt32 { @@ -1378,8 +1416,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa if err != nil { return nil, syserr.TranslateNetstackError(err) } - - return int32(v), nil + vP := primitive.Int32(v) + return &vP, nil case linux.TCP_WINDOW_CLAMP: if outLen < sizeOfInt32 { @@ -1390,8 +1428,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa if err != nil { return nil, syserr.TranslateNetstackError(err) } - - return int32(v), nil + vP := primitive.Int32(v) + return &vP, nil default: emitUnimplementedEventTCP(t, name) } @@ -1399,7 +1437,7 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa } // getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6. -func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interface{}, *syserr.Error) { +func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) { switch name { case linux.IPV6_V6ONLY: if outLen < sizeOfInt32 { @@ -1410,7 +1448,9 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil case linux.IPV6_PATHMTU: t.Kernel().EmitUnimplementedEvent(t) @@ -1418,21 +1458,24 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf case linux.IPV6_TCLASS: // Length handling for parity with Linux. if outLen == 0 { - return make([]byte, 0), nil + var b primitive.ByteSlice + return &b, nil } v, err := ep.GetSockOptInt(tcpip.IPv6TrafficClassOption) if err != nil { return nil, syserr.TranslateNetstackError(err) } - uintv := uint32(v) + uintv := primitive.Uint32(v) // Linux truncates the output binary to outLen. - ib := binary.Marshal(nil, usermem.ByteOrder, &uintv) + ib := t.CopyScratchBuffer(uintv.SizeBytes()) + uintv.MarshalUnsafe(ib) // Handle cases where outLen is lesser than sizeOfInt32. if len(ib) > outLen { ib = ib[:outLen] } - return ib, nil + ibP := primitive.ByteSlice(ib) + return &ibP, nil case linux.IPV6_RECVTCLASS: if outLen < sizeOfInt32 { @@ -1443,7 +1486,9 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil default: emitUnimplementedEventIPv6(t, name) @@ -1452,7 +1497,7 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf } // getSockOptIP implements GetSockOpt when level is SOL_IP. -func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family int) (interface{}, *syserr.Error) { +func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family int) (marshal.Marshallable, *syserr.Error) { switch name { case linux.IP_TTL: if outLen < sizeOfInt32 { @@ -1465,11 +1510,12 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in } // Fill in the default value, if needed. - if v == 0 { - v = DefaultTTL + vP := primitive.Int32(v) + if vP == 0 { + vP = DefaultTTL } - return int32(v), nil + return &vP, nil case linux.IP_MULTICAST_TTL: if outLen < sizeOfInt32 { @@ -1481,7 +1527,8 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in return nil, syserr.TranslateNetstackError(err) } - return int32(v), nil + vP := primitive.Int32(v) + return &vP, nil case linux.IP_MULTICAST_IF: if outLen < len(linux.InetAddr{}) { @@ -1495,7 +1542,7 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr}) - return a.(*linux.SockAddrInet).Addr, nil + return &a.(*linux.SockAddrInet).Addr, nil case linux.IP_MULTICAST_LOOP: if outLen < sizeOfInt32 { @@ -1506,21 +1553,26 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil case linux.IP_TOS: // Length handling for parity with Linux. if outLen == 0 { - return []byte(nil), nil + var b primitive.ByteSlice + return &b, nil } v, err := ep.GetSockOptInt(tcpip.IPv4TOSOption) if err != nil { return nil, syserr.TranslateNetstackError(err) } if outLen < sizeOfInt32 { - return uint8(v), nil + vP := primitive.Uint8(v) + return &vP, nil } - return int32(v), nil + vP := primitive.Int32(v) + return &vP, nil case linux.IP_RECVTOS: if outLen < sizeOfInt32 { @@ -1531,7 +1583,9 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil case linux.IP_PKTINFO: if outLen < sizeOfInt32 { @@ -1542,7 +1596,9 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil default: emitUnimplementedEventIP(t, name) @@ -2468,6 +2524,23 @@ func (s *socketOpsCommon) fillCmsgInq(cmsg *socket.ControlMessages) { cmsg.IP.Inq = int32(len(s.readView) + rcvBufUsed) } +func toLinuxPacketType(pktType tcpip.PacketType) uint8 { + switch pktType { + case tcpip.PacketHost: + return linux.PACKET_HOST + case tcpip.PacketOtherHost: + return linux.PACKET_OTHERHOST + case tcpip.PacketOutgoing: + return linux.PACKET_OUTGOING + case tcpip.PacketBroadcast: + return linux.PACKET_BROADCAST + case tcpip.PacketMulticast: + return linux.PACKET_MULTICAST + default: + panic(fmt.Sprintf("unknown packet type: %d", pktType)) + } +} + // nonBlockingRead issues a non-blocking read. // // TODO(b/78348848): Support timestamps for stream sockets. @@ -2526,6 +2599,7 @@ func (s *socketOpsCommon) nonBlockingRead(ctx context.Context, dst usermem.IOSeq switch v := addr.(type) { case *linux.SockAddrLink: v.Protocol = htons(uint16(s.linkPacketInfo.Protocol)) + v.PacketType = toLinuxPacketType(s.linkPacketInfo.PktType) } } @@ -2761,6 +2835,11 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, } func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + panic("ioctl(2) may only be called from a task goroutine") + } + // SIOCGSTAMP is implemented by netstack rather than all commonEndpoint // sockets. // TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP. @@ -2773,9 +2852,7 @@ func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.Sy } tv := linux.NsecToTimeval(s.timestampNS) - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &tv, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := tv.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCINQ: @@ -2794,9 +2871,8 @@ func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.Sy } // Copy result to userspace. - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ - AddressSpaceActive: true, - }) + vP := primitive.Int32(v) + _, err := vP.CopyOut(t, args[2].Pointer()) return 0, err } @@ -2805,6 +2881,11 @@ func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.Sy // Ioctl performs a socket ioctl. func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + panic("ioctl(2) may only be called from a task goroutine") + } + switch arg := int(args[1].Int()); arg { case linux.SIOCGIFFLAGS, linux.SIOCGIFADDR, @@ -2821,37 +2902,28 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc linux.SIOCETHTOOL: var ifr linux.IFReq - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &ifr, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + if _, err := ifr.CopyIn(t, args[2].Pointer()); err != nil { return 0, err } if err := interfaceIoctl(ctx, io, arg, &ifr); err != nil { return 0, err.ToError() } - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &ifr, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := ifr.CopyOut(t, args[2].Pointer()) return 0, err case linux.SIOCGIFCONF: // Return a list of interface addresses or the buffer size // necessary to hold the list. var ifc linux.IFConf - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &ifc, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + if _, err := ifc.CopyIn(t, args[2].Pointer()); err != nil { return 0, err } - if err := ifconfIoctl(ctx, io, &ifc); err != nil { + if err := ifconfIoctl(ctx, t, io, &ifc); err != nil { return 0, err } - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), ifc, usermem.IOOpts{ - AddressSpaceActive: true, - }) - + _, err := ifc.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCINQ: @@ -2864,9 +2936,8 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc v = math.MaxInt32 } // Copy result to userspace. - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ - AddressSpaceActive: true, - }) + vP := primitive.Int32(v) + _, err := vP.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCOUTQ: @@ -2880,9 +2951,8 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc } // Copy result to userspace. - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ - AddressSpaceActive: true, - }) + vP := primitive.Int32(v) + _, err := vP.CopyOut(t, args[2].Pointer()) return 0, err case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG: @@ -3031,7 +3101,7 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe } // ifconfIoctl populates a struct ifconf for the SIOCGIFCONF ioctl. -func ifconfIoctl(ctx context.Context, io usermem.IO, ifc *linux.IFConf) error { +func ifconfIoctl(ctx context.Context, t *kernel.Task, io usermem.IO, ifc *linux.IFConf) error { // If Ptr is NULL, return the necessary buffer size via Len. // Otherwise, write up to Len bytes starting at Ptr containing ifreq // structs. @@ -3068,9 +3138,7 @@ func ifconfIoctl(ctx context.Context, io usermem.IO, ifc *linux.IFConf) error { // Copy the ifr to userspace. dst := uintptr(ifc.Ptr) + uintptr(ifc.Len) ifc.Len += int32(linux.SizeOfIFReq) - if _, err := usermem.CopyObjectOut(ctx, io, usermem.Addr(dst), ifr, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + if _, err := ifr.CopyOut(t, usermem.Addr(dst)); err != nil { return err } } diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go index d65a89316..a9025b0ec 100644 --- a/pkg/sentry/socket/netstack/netstack_vfs2.go +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -31,6 +31,8 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" + "gvisor.dev/gvisor/tools/go_marshal/marshal" + "gvisor.dev/gvisor/tools/go_marshal/primitive" ) // SocketVFS2 encapsulates all the state needed to represent a network stack @@ -200,7 +202,7 @@ func (s *SocketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { +func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is // implemented specifically for netstack.SocketVFS2 rather than // commonEndpoint. commonEndpoint should be extended to support socket @@ -210,25 +212,25 @@ func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem. if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - val := int32(0) + val := primitive.Int32(0) s.readMu.Lock() defer s.readMu.Unlock() if s.sockOptTimestamp { val = 1 } - return val, nil + return &val, nil } if level == linux.SOL_TCP && name == linux.TCP_INQ { if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - val := int32(0) + val := primitive.Int32(0) s.readMu.Lock() defer s.readMu.Unlock() if s.sockOptInq { val = 1 } - return val, nil + return &val, nil } if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP { @@ -246,7 +248,7 @@ func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem. if err != nil { return nil, err } - return info, nil + return &info, nil case linux.IPT_SO_GET_ENTRIES: if outLen < linux.SizeOfIPTGetEntries { @@ -261,7 +263,7 @@ func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem. if err != nil { return nil, err } - return entries, nil + return &entries, nil } } diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index fcd7f9d7f..d112757fb 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -35,6 +35,7 @@ import ( "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/tools/go_marshal/marshal" ) // ControlMessages represents the union of unix control messages and tcpip @@ -86,7 +87,7 @@ type SocketOps interface { Shutdown(t *kernel.Task, how int) *syserr.Error // GetSockOpt implements the getsockopt(2) linux syscall. - GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) + GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) // SetSockOpt implements the setsockopt(2) linux syscall. SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index cca5e70f1..061a689a9 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -35,5 +35,6 @@ go_library( "//pkg/tcpip", "//pkg/usermem", "//pkg/waiter", + "//tools/go_marshal/marshal", ], ) diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 4bb2b6ff4..0482d33cf 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -40,6 +40,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" + "gvisor.dev/gvisor/tools/go_marshal/marshal" ) // SocketOperations is a Unix socket. It is similar to a netstack socket, @@ -184,7 +185,7 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by // a transport.Endpoint. -func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { +func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen) } diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index ff2149250..05c16fcfe 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -32,6 +32,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" + "gvisor.dev/gvisor/tools/go_marshal/marshal" ) // SocketVFS2 implements socket.SocketVFS2 (and by extension, @@ -89,7 +90,7 @@ func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint3 // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by // a transport.Endpoint. -func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { +func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen) } diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 217fcfef2..4a9b04fd0 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -99,5 +99,7 @@ go_library( "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", + "//tools/go_marshal/marshal", + "//tools/go_marshal/primitive", ], ) diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index ea4f9b1a7..80c65164a 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -325,8 +325,8 @@ var AMD64 = &kernel.SyscallTable{ 270: syscalls.Supported("pselect", Pselect), 271: syscalls.Supported("ppoll", Ppoll), 272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil), - 273: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil), - 274: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 273: syscalls.Supported("set_robust_list", SetRobustList), + 274: syscalls.Supported("get_robust_list", GetRobustList), 275: syscalls.Supported("splice", Splice), 276: syscalls.Supported("tee", Tee), 277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil), diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index b68261f72..f04d78856 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -198,7 +198,7 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall switch cmd { case linux.FUTEX_WAIT: // WAIT uses a relative timeout. - mask = ^uint32(0) + mask = linux.FUTEX_BITSET_MATCH_ANY var timeoutDur time.Duration if !forever { timeoutDur = time.Duration(timespec.ToNsecCapped()) * time.Nanosecond @@ -286,3 +286,49 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, syserror.ENOSYS } } + +// SetRobustList implements linux syscall set_robust_list(2). +func SetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // Despite the syscall using the name 'pid' for this variable, it is + // very much a tid. + head := args[0].Pointer() + length := args[1].SizeT() + + if length != uint(linux.SizeOfRobustListHead) { + return 0, nil, syserror.EINVAL + } + t.SetRobustList(head) + return 0, nil, nil +} + +// GetRobustList implements linux syscall get_robust_list(2). +func GetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // Despite the syscall using the name 'pid' for this variable, it is + // very much a tid. + tid := args[0].Int() + head := args[1].Pointer() + size := args[2].Pointer() + + if tid < 0 { + return 0, nil, syserror.EINVAL + } + + ot := t + if tid != 0 { + if ot = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)); ot == nil { + return 0, nil, syserror.ESRCH + } + } + + // Copy out head pointer. + if _, err := t.CopyOut(head, uint64(ot.GetRobustList())); err != nil { + return 0, nil, err + } + + // Copy out size, which is a constant. + if _, err := t.CopyOut(size, uint64(linux.SizeOfRobustListHead)); err != nil { + return 0, nil, err + } + + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 0760af77b..414fce8e3 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -29,6 +29,8 @@ import ( "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/tools/go_marshal/marshal" + "gvisor.dev/gvisor/tools/go_marshal/primitive" ) // LINT.IfChange @@ -474,7 +476,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } if v != nil { - if _, err := t.CopyOut(optValAddr, v); err != nil { + if _, err := v.CopyOut(t, optValAddr); err != nil { return 0, nil, err } } @@ -484,7 +486,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // getSockOpt tries to handle common socket options, or dispatches to a specific // socket implementation. -func getSockOpt(t *kernel.Task, s socket.Socket, level, name int, optValAddr usermem.Addr, len int) (interface{}, *syserr.Error) { +func getSockOpt(t *kernel.Task, s socket.Socket, level, name int, optValAddr usermem.Addr, len int) (marshal.Marshallable, *syserr.Error) { if level == linux.SOL_SOCKET { switch name { case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL: @@ -496,13 +498,16 @@ func getSockOpt(t *kernel.Task, s socket.Socket, level, name int, optValAddr use switch name { case linux.SO_TYPE: _, skType, _ := s.Type() - return int32(skType), nil + v := primitive.Int32(skType) + return &v, nil case linux.SO_DOMAIN: family, _, _ := s.Type() - return int32(family), nil + v := primitive.Int32(family) + return &v, nil case linux.SO_PROTOCOL: _, _, protocol := s.Type() - return int32(protocol), nil + v := primitive.Int32(protocol) + return &v, nil } } @@ -539,7 +544,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return 0, nil, syserror.EINVAL } buf := t.CopyScratchBuffer(int(optLen)) - if _, err := t.CopyIn(optValAddr, &buf); err != nil { + if _, err := t.CopyInBytes(optValAddr, buf); err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index 0c740335b..64696b438 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -72,5 +72,7 @@ go_library( "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", + "//tools/go_marshal/marshal", + "//tools/go_marshal/primitive", ], ) diff --git a/pkg/sentry/syscalls/linux/vfs2/mount.go b/pkg/sentry/syscalls/linux/vfs2/mount.go index adeaa39cc..ea337de7c 100644 --- a/pkg/sentry/syscalls/linux/vfs2/mount.go +++ b/pkg/sentry/syscalls/linux/vfs2/mount.go @@ -77,8 +77,7 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Silently allow MS_NOSUID, since we don't implement set-id bits // anyway. - const unsupportedFlags = linux.MS_NODEV | - linux.MS_NODIRATIME | linux.MS_STRICTATIME + const unsupportedFlags = linux.MS_NODIRATIME | linux.MS_STRICTATIME // Linux just allows passing any flags to mount(2) - it won't fail when // unknown or unsupported flags are passed. Since we don't implement @@ -94,6 +93,12 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if flags&linux.MS_NOEXEC == linux.MS_NOEXEC { opts.Flags.NoExec = true } + if flags&linux.MS_NODEV == linux.MS_NODEV { + opts.Flags.NoDev = true + } + if flags&linux.MS_NOSUID == linux.MS_NOSUID { + opts.Flags.NoSUID = true + } if flags&linux.MS_RDONLY == linux.MS_RDONLY { opts.ReadOnly = true } diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go index 09ecfed26..6daedd173 100644 --- a/pkg/sentry/syscalls/linux/vfs2/setstat.go +++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go @@ -178,6 +178,7 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc Mask: linux.STATX_SIZE, Size: uint64(length), }, + NeedWritePerm: true, }) return 0, nil, handleSetSizeError(t, err) } @@ -197,6 +198,10 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } defer file.DecRef() + if !file.IsWritable() { + return 0, nil, syserror.EINVAL + } + err := file.SetStat(t, vfs.SetStatOptions{ Stat: linux.Statx{ Mask: linux.STATX_SIZE, diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go index 10b668477..8096a8f9c 100644 --- a/pkg/sentry/syscalls/linux/vfs2/socket.go +++ b/pkg/sentry/syscalls/linux/vfs2/socket.go @@ -30,6 +30,8 @@ import ( "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/tools/go_marshal/marshal" + "gvisor.dev/gvisor/tools/go_marshal/primitive" ) // minListenBacklog is the minimum reasonable backlog for listening sockets. @@ -477,7 +479,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } if v != nil { - if _, err := t.CopyOut(optValAddr, v); err != nil { + if _, err := v.CopyOut(t, optValAddr); err != nil { return 0, nil, err } } @@ -487,7 +489,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // getSockOpt tries to handle common socket options, or dispatches to a specific // socket implementation. -func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr usermem.Addr, len int) (interface{}, *syserr.Error) { +func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr usermem.Addr, len int) (marshal.Marshallable, *syserr.Error) { if level == linux.SOL_SOCKET { switch name { case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL: @@ -499,13 +501,16 @@ func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr switch name { case linux.SO_TYPE: _, skType, _ := s.Type() - return int32(skType), nil + v := primitive.Int32(skType) + return &v, nil case linux.SO_DOMAIN: family, _, _ := s.Type() - return int32(family), nil + v := primitive.Int32(family) + return &v, nil case linux.SO_PROTOCOL: _, _, protocol := s.Type() - return int32(protocol), nil + v := primitive.Int32(protocol) + return &v, nil } } @@ -542,7 +547,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return 0, nil, syserror.EINVAL } buf := t.CopyScratchBuffer(int(optLen)) - if _, err := t.CopyIn(optValAddr, &buf); err != nil { + if _, err := t.CopyInBytes(optValAddr, buf); err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go index 945a364a7..63ab11f8c 100644 --- a/pkg/sentry/syscalls/linux/vfs2/splice.go +++ b/pkg/sentry/syscalls/linux/vfs2/splice.go @@ -15,12 +15,15 @@ package vfs2 import ( + "io" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -110,16 +113,20 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Move data. var ( - n int64 - err error - inCh chan struct{} - outCh chan struct{} + n int64 + err error ) + dw := dualWaiter{ + inFile: inFile, + outFile: outFile, + } + defer dw.destroy() for { // If both input and output are pipes, delegate to the pipe - // implementation. Otherwise, exactly one end is a pipe, which we - // ensure is consistently ordered after the non-pipe FD's locks by - // passing the pipe FD as usermem.IO to the non-pipe end. + // implementation. Otherwise, exactly one end is a pipe, which + // we ensure is consistently ordered after the non-pipe FD's + // locks by passing the pipe FD as usermem.IO to the non-pipe + // end. switch { case inIsPipe && outIsPipe: n, err = pipe.Splice(t, outPipeFD, inPipeFD, count) @@ -137,38 +144,15 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } else { n, err = inFile.Read(t, outPipeFD.IOSequence(count), vfs.ReadOptions{}) } + default: + panic("not possible") } + if n != 0 || err != syserror.ErrWouldBlock || nonBlock { break } - - // Note that the blocking behavior here is a bit different than the - // normal pattern. Because we need to have both data to read and data - // to write simultaneously, we actually explicitly block on both of - // these cases in turn before returning to the splice operation. - if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 { - if inCh == nil { - inCh = make(chan struct{}, 1) - inW, _ := waiter.NewChannelEntry(inCh) - inFile.EventRegister(&inW, eventMaskRead) - defer inFile.EventUnregister(&inW) - continue // Need to refresh readiness. - } - if err = t.Block(inCh); err != nil { - break - } - } - if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 { - if outCh == nil { - outCh = make(chan struct{}, 1) - outW, _ := waiter.NewChannelEntry(outCh) - outFile.EventRegister(&outW, eventMaskWrite) - defer outFile.EventUnregister(&outW) - continue // Need to refresh readiness. - } - if err = t.Block(outCh); err != nil { - break - } + if err = dw.waitForBoth(t); err != nil { + break } } @@ -247,45 +231,256 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo // Copy data. var ( - inCh chan struct{} - outCh chan struct{} + n int64 + err error ) + dw := dualWaiter{ + inFile: inFile, + outFile: outFile, + } + defer dw.destroy() for { - n, err := pipe.Tee(t, outPipeFD, inPipeFD, count) - if n != 0 { - return uintptr(n), nil, nil + n, err = pipe.Tee(t, outPipeFD, inPipeFD, count) + if n != 0 || err != syserror.ErrWouldBlock || nonBlock { + break + } + if err = dw.waitForBoth(t); err != nil { + break + } + } + if n == 0 { + return 0, nil, err + } + outFile.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + return uintptr(n), nil, nil +} + +// Sendfile implements linux system call sendfile(2). +func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + outFD := args[0].Int() + inFD := args[1].Int() + offsetAddr := args[2].Pointer() + count := int64(args[3].SizeT()) + + inFile := t.GetFileVFS2(inFD) + if inFile == nil { + return 0, nil, syserror.EBADF + } + defer inFile.DecRef() + if !inFile.IsReadable() { + return 0, nil, syserror.EBADF + } + + outFile := t.GetFileVFS2(outFD) + if outFile == nil { + return 0, nil, syserror.EBADF + } + defer outFile.DecRef() + if !outFile.IsWritable() { + return 0, nil, syserror.EBADF + } + + // Verify that the outFile Append flag is not set. + if outFile.StatusFlags()&linux.O_APPEND != 0 { + return 0, nil, syserror.EINVAL + } + + // Verify that inFile is a regular file or block device. This is a + // requirement; the same check appears in Linux + // (fs/splice.c:splice_direct_to_actor). + if stat, err := inFile.Stat(t, vfs.StatOptions{Mask: linux.STATX_TYPE}); err != nil { + return 0, nil, err + } else if stat.Mask&linux.STATX_TYPE == 0 || + (stat.Mode&linux.S_IFMT != linux.S_IFREG && stat.Mode&linux.S_IFMT != linux.S_IFBLK) { + return 0, nil, syserror.EINVAL + } + + // Copy offset if it exists. + offset := int64(-1) + if offsetAddr != 0 { + if inFile.Options().DenyPRead { + return 0, nil, syserror.ESPIPE } - if err != syserror.ErrWouldBlock || nonBlock { + if _, err := t.CopyIn(offsetAddr, &offset); err != nil { return 0, nil, err } + if offset < 0 { + return 0, nil, syserror.EINVAL + } + if offset+count < 0 { + return 0, nil, syserror.EINVAL + } + } + + // Validate count. This must come after offset checks. + if count < 0 { + return 0, nil, syserror.EINVAL + } + if count == 0 { + return 0, nil, nil + } + if count > int64(kernel.MAX_RW_COUNT) { + count = int64(kernel.MAX_RW_COUNT) + } - // Note that the blocking behavior here is a bit different than the - // normal pattern. Because we need to have both data to read and data - // to write simultaneously, we actually explicitly block on both of - // these cases in turn before returning to the tee operation. - if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 { - if inCh == nil { - inCh = make(chan struct{}, 1) - inW, _ := waiter.NewChannelEntry(inCh) - inFile.EventRegister(&inW, eventMaskRead) - defer inFile.EventUnregister(&inW) - continue // Need to refresh readiness. + // Copy data. + var ( + n int64 + err error + ) + dw := dualWaiter{ + inFile: inFile, + outFile: outFile, + } + defer dw.destroy() + outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) + // Reading from input file should never block, since it is regular or + // block device. We only need to check if writing to the output file + // can block. + nonBlock := outFile.StatusFlags()&linux.O_NONBLOCK != 0 + if outIsPipe { + for n < count { + var spliceN int64 + if offset != -1 { + spliceN, err = inFile.PRead(t, outPipeFD.IOSequence(count), offset, vfs.ReadOptions{}) + offset += spliceN + } else { + spliceN, err = inFile.Read(t, outPipeFD.IOSequence(count), vfs.ReadOptions{}) } - if err := t.Block(inCh); err != nil { - return 0, nil, err + n += spliceN + if err == syserror.ErrWouldBlock && !nonBlock { + err = dw.waitForBoth(t) + } + if err != nil { + break } } - if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 { - if outCh == nil { - outCh = make(chan struct{}, 1) - outW, _ := waiter.NewChannelEntry(outCh) - outFile.EventRegister(&outW, eventMaskWrite) - defer outFile.EventUnregister(&outW) - continue // Need to refresh readiness. + } else { + // Read inFile to buffer, then write the contents to outFile. + buf := make([]byte, count) + for n < count { + var readN int64 + if offset != -1 { + readN, err = inFile.PRead(t, usermem.BytesIOSequence(buf), offset, vfs.ReadOptions{}) + offset += readN + } else { + readN, err = inFile.Read(t, usermem.BytesIOSequence(buf), vfs.ReadOptions{}) + } + if readN == 0 && err == io.EOF { + // We reached the end of the file. Eat the + // error and exit the loop. + err = nil + break } - if err := t.Block(outCh); err != nil { - return 0, nil, err + n += readN + if err != nil { + break + } + + // Write all of the bytes that we read. This may need + // multiple write calls to complete. + wbuf := buf[:n] + for len(wbuf) > 0 { + var writeN int64 + writeN, err = outFile.Write(t, usermem.BytesIOSequence(wbuf), vfs.WriteOptions{}) + wbuf = wbuf[writeN:] + if err == syserror.ErrWouldBlock && !nonBlock { + err = dw.waitForOut(t) + } + if err != nil { + // We didn't complete the write. Only + // report the bytes that were actually + // written, and rewind the offset. + notWritten := int64(len(wbuf)) + n -= notWritten + if offset != -1 { + offset -= notWritten + } + break + } + } + if err == syserror.ErrWouldBlock && !nonBlock { + err = dw.waitForBoth(t) } + if err != nil { + break + } + } + } + + if offsetAddr != 0 { + // Copy out the new offset. + if _, err := t.CopyOut(offsetAddr, offset); err != nil { + return 0, nil, err + } + } + + if n == 0 { + return 0, nil, err + } + + inFile.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + outFile.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + return uintptr(n), nil, nil +} + +// dualWaiter is used to wait on one or both vfs.FileDescriptions. It is not +// thread-safe, and does not take a reference on the vfs.FileDescriptions. +// +// Users must call destroy() when finished. +type dualWaiter struct { + inFile *vfs.FileDescription + outFile *vfs.FileDescription + + inW waiter.Entry + inCh chan struct{} + outW waiter.Entry + outCh chan struct{} +} + +// waitForBoth waits for both dw.inFile and dw.outFile to be ready. +func (dw *dualWaiter) waitForBoth(t *kernel.Task) error { + if dw.inFile.Readiness(eventMaskRead)&eventMaskRead == 0 { + if dw.inCh == nil { + dw.inW, dw.inCh = waiter.NewChannelEntry(nil) + dw.inFile.EventRegister(&dw.inW, eventMaskRead) + // We might be ready now. Try again before blocking. + return nil + } + if err := t.Block(dw.inCh); err != nil { + return err + } + } + return dw.waitForOut(t) +} + +// waitForOut waits for dw.outfile to be read. +func (dw *dualWaiter) waitForOut(t *kernel.Task) error { + if dw.outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 { + if dw.outCh == nil { + dw.outW, dw.outCh = waiter.NewChannelEntry(nil) + dw.outFile.EventRegister(&dw.outW, eventMaskWrite) + // We might be ready now. Try again before blocking. + return nil } + if err := t.Block(dw.outCh); err != nil { + return err + } + } + return nil +} + +// destroy cleans up resources help by dw. No more calls to wait* can occur +// after destroy is called. +func (dw *dualWaiter) destroy() { + if dw.inCh != nil { + dw.inFile.EventUnregister(&dw.inW) + dw.inCh = nil + } + if dw.outCh != nil { + dw.outFile.EventUnregister(&dw.outW) + dw.outCh = nil } + dw.inFile = nil + dw.outFile = nil } diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index 8f497ecc7..c576d9475 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -44,7 +44,7 @@ func Override() { s.Table[23] = syscalls.Supported("select", Select) s.Table[32] = syscalls.Supported("dup", Dup) s.Table[33] = syscalls.Supported("dup2", Dup2) - delete(s.Table, 40) // sendfile + s.Table[40] = syscalls.Supported("sendfile", Sendfile) s.Table[41] = syscalls.Supported("socket", Socket) s.Table[42] = syscalls.Supported("connect", Connect) s.Table[43] = syscalls.Supported("accept", Accept) @@ -62,7 +62,7 @@ func Override() { s.Table[55] = syscalls.Supported("getsockopt", GetSockOpt) s.Table[59] = syscalls.Supported("execve", Execve) s.Table[72] = syscalls.Supported("fcntl", Fcntl) - s.Table[73] = syscalls.Supported("fcntl", Flock) + s.Table[73] = syscalls.Supported("flock", Flock) s.Table[74] = syscalls.Supported("fsync", Fsync) s.Table[75] = syscalls.Supported("fdatasync", Fdatasync) s.Table[76] = syscalls.Supported("truncate", Truncate) @@ -163,6 +163,106 @@ func Override() { // Override ARM64. s = linux.ARM64 + s.Table[5] = syscalls.Supported("setxattr", Setxattr) + s.Table[6] = syscalls.Supported("lsetxattr", Lsetxattr) + s.Table[7] = syscalls.Supported("fsetxattr", Fsetxattr) + s.Table[8] = syscalls.Supported("getxattr", Getxattr) + s.Table[9] = syscalls.Supported("lgetxattr", Lgetxattr) + s.Table[10] = syscalls.Supported("fgetxattr", Fgetxattr) + s.Table[11] = syscalls.Supported("listxattr", Listxattr) + s.Table[12] = syscalls.Supported("llistxattr", Llistxattr) + s.Table[13] = syscalls.Supported("flistxattr", Flistxattr) + s.Table[14] = syscalls.Supported("removexattr", Removexattr) + s.Table[15] = syscalls.Supported("lremovexattr", Lremovexattr) + s.Table[16] = syscalls.Supported("fremovexattr", Fremovexattr) + s.Table[17] = syscalls.Supported("getcwd", Getcwd) + s.Table[19] = syscalls.Supported("eventfd2", Eventfd2) + s.Table[20] = syscalls.Supported("epoll_create1", EpollCreate1) + s.Table[21] = syscalls.Supported("epoll_ctl", EpollCtl) + s.Table[22] = syscalls.Supported("epoll_pwait", EpollPwait) + s.Table[23] = syscalls.Supported("dup", Dup) + s.Table[24] = syscalls.Supported("dup3", Dup3) + s.Table[25] = syscalls.Supported("fcntl", Fcntl) + s.Table[26] = syscalls.PartiallySupported("inotify_init1", InotifyInit1, "inotify events are only available inside the sandbox.", nil) + s.Table[27] = syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil) + s.Table[28] = syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil) + s.Table[29] = syscalls.Supported("ioctl", Ioctl) + s.Table[32] = syscalls.Supported("flock", Flock) + s.Table[33] = syscalls.Supported("mknodat", Mknodat) + s.Table[34] = syscalls.Supported("mkdirat", Mkdirat) + s.Table[35] = syscalls.Supported("unlinkat", Unlinkat) + s.Table[36] = syscalls.Supported("symlinkat", Symlinkat) + s.Table[37] = syscalls.Supported("linkat", Linkat) + s.Table[38] = syscalls.Supported("renameat", Renameat) + s.Table[39] = syscalls.Supported("umount2", Umount2) + s.Table[40] = syscalls.Supported("mount", Mount) + s.Table[43] = syscalls.Supported("statfs", Statfs) + s.Table[44] = syscalls.Supported("fstatfs", Fstatfs) + s.Table[45] = syscalls.Supported("truncate", Truncate) + s.Table[46] = syscalls.Supported("ftruncate", Ftruncate) + s.Table[48] = syscalls.Supported("faccessat", Faccessat) + s.Table[49] = syscalls.Supported("chdir", Chdir) + s.Table[50] = syscalls.Supported("fchdir", Fchdir) + s.Table[51] = syscalls.Supported("chroot", Chroot) + s.Table[52] = syscalls.Supported("fchmod", Fchmod) + s.Table[53] = syscalls.Supported("fchmodat", Fchmodat) + s.Table[54] = syscalls.Supported("fchownat", Fchownat) + s.Table[55] = syscalls.Supported("fchown", Fchown) + s.Table[56] = syscalls.Supported("openat", Openat) + s.Table[57] = syscalls.Supported("close", Close) + s.Table[59] = syscalls.Supported("pipe2", Pipe2) + s.Table[61] = syscalls.Supported("getdents64", Getdents64) + s.Table[62] = syscalls.Supported("lseek", Lseek) s.Table[63] = syscalls.Supported("read", Read) + s.Table[64] = syscalls.Supported("write", Write) + s.Table[65] = syscalls.Supported("readv", Readv) + s.Table[66] = syscalls.Supported("writev", Writev) + s.Table[67] = syscalls.Supported("pread64", Pread64) + s.Table[68] = syscalls.Supported("pwrite64", Pwrite64) + s.Table[69] = syscalls.Supported("preadv", Preadv) + s.Table[70] = syscalls.Supported("pwritev", Pwritev) + s.Table[72] = syscalls.Supported("pselect", Pselect) + s.Table[73] = syscalls.Supported("ppoll", Ppoll) + s.Table[74] = syscalls.Supported("signalfd4", Signalfd4) + s.Table[76] = syscalls.Supported("splice", Splice) + s.Table[77] = syscalls.Supported("tee", Tee) + s.Table[78] = syscalls.Supported("readlinkat", Readlinkat) + s.Table[80] = syscalls.Supported("fstat", Fstat) + s.Table[81] = syscalls.Supported("sync", Sync) + s.Table[82] = syscalls.Supported("fsync", Fsync) + s.Table[83] = syscalls.Supported("fdatasync", Fdatasync) + s.Table[84] = syscalls.Supported("sync_file_range", SyncFileRange) + s.Table[85] = syscalls.Supported("timerfd_create", TimerfdCreate) + s.Table[86] = syscalls.Supported("timerfd_settime", TimerfdSettime) + s.Table[87] = syscalls.Supported("timerfd_gettime", TimerfdGettime) + s.Table[88] = syscalls.Supported("utimensat", Utimensat) + s.Table[198] = syscalls.Supported("socket", Socket) + s.Table[199] = syscalls.Supported("socketpair", SocketPair) + s.Table[200] = syscalls.Supported("bind", Bind) + s.Table[201] = syscalls.Supported("listen", Listen) + s.Table[202] = syscalls.Supported("accept", Accept) + s.Table[203] = syscalls.Supported("connect", Connect) + s.Table[204] = syscalls.Supported("getsockname", GetSockName) + s.Table[205] = syscalls.Supported("getpeername", GetPeerName) + s.Table[206] = syscalls.Supported("sendto", SendTo) + s.Table[207] = syscalls.Supported("recvfrom", RecvFrom) + s.Table[208] = syscalls.Supported("setsockopt", SetSockOpt) + s.Table[209] = syscalls.Supported("getsockopt", GetSockOpt) + s.Table[210] = syscalls.Supported("shutdown", Shutdown) + s.Table[211] = syscalls.Supported("sendmsg", SendMsg) + s.Table[212] = syscalls.Supported("recvmsg", RecvMsg) + s.Table[221] = syscalls.Supported("execve", Execve) + s.Table[222] = syscalls.Supported("mmap", Mmap) + s.Table[242] = syscalls.Supported("accept4", Accept4) + s.Table[243] = syscalls.Supported("recvmmsg", RecvMMsg) + s.Table[267] = syscalls.Supported("syncfs", Syncfs) + s.Table[269] = syscalls.Supported("sendmmsg", SendMMsg) + s.Table[276] = syscalls.Supported("renameat2", Renameat2) + s.Table[279] = syscalls.Supported("memfd_create", MemfdCreate) + s.Table[281] = syscalls.Supported("execveat", Execveat) + s.Table[286] = syscalls.Supported("preadv2", Preadv2) + s.Table[287] = syscalls.Supported("pwritev2", Pwritev2) + s.Table[291] = syscalls.Supported("statx", Statx) + s.Init() } diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index f223aeda8..dfc8573fd 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -79,6 +79,17 @@ type MountFlags struct { // NoATime is equivalent to MS_NOATIME and indicates that the // filesystem should not update access time in-place. NoATime bool + + // NoDev is equivalent to MS_NODEV and indicates that the + // filesystem should not allow access to devices (special files). + // TODO(gVisor.dev/issue/3186): respect this flag in non FUSE + // filesystems. + NoDev bool + + // NoSUID is equivalent to MS_NOSUID and indicates that the + // filesystem should not honor set-user-ID and set-group-ID bits or + // file capabilities when executing programs. + NoSUID bool } // MountOptions contains options to VirtualFilesystem.MountAt(). @@ -153,6 +164,12 @@ type SetStatOptions struct { // == UTIME_OMIT (VFS users must unset the corresponding bit in Stat.Mask // instead). Stat linux.Statx + + // NeedWritePerm indicates that write permission on the file is needed for + // this operation. This is needed for truncate(2) (note that ftruncate(2) + // does not require the same check--instead, it checks that the fd is + // writable). + NeedWritePerm bool } // BoundEndpointOptions contains options to VirtualFilesystem.BoundEndpointAt() diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index 9cb050597..33389c1df 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -183,7 +183,8 @@ func MayWriteFileWithOpenFlags(flags uint32) bool { // CheckSetStat checks that creds has permission to change the metadata of a // file with the given permissions, UID, and GID as specified by stat, subject // to the rules of Linux's fs/attr.c:setattr_prepare(). -func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error { +func CheckSetStat(ctx context.Context, creds *auth.Credentials, opts *SetStatOptions, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error { + stat := &opts.Stat if stat.Mask&linux.STATX_SIZE != 0 { limit, err := CheckLimit(ctx, 0, int64(stat.Size)) if err != nil { @@ -215,6 +216,11 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Stat return syserror.EPERM } } + if opts.NeedWritePerm && !creds.HasCapability(linux.CAP_DAC_OVERRIDE) { + if err := GenericCheckPermissions(creds, MayWrite, mode, kuid, kgid); err != nil { + return err + } + } if stat.Mask&(linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME) != 0 { if !CanActAsOwner(creds, kuid) { if (stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW) || diff --git a/pkg/tcpip/header/icmpv4.go b/pkg/tcpip/header/icmpv4.go index 7908c5744..1a631b31a 100644 --- a/pkg/tcpip/header/icmpv4.go +++ b/pkg/tcpip/header/icmpv4.go @@ -72,6 +72,7 @@ const ( // Values for ICMP code as defined in RFC 792. const ( ICMPv4TTLExceeded = 0 + ICMPv4HostUnreachable = 1 ICMPv4PortUnreachable = 3 ICMPv4FragmentationNeeded = 4 ) diff --git a/pkg/tcpip/header/icmpv6.go b/pkg/tcpip/header/icmpv6.go index c7ee2de57..a13b4b809 100644 --- a/pkg/tcpip/header/icmpv6.go +++ b/pkg/tcpip/header/icmpv6.go @@ -110,9 +110,16 @@ const ( ICMPv6RedirectMsg ICMPv6Type = 137 ) -// Values for ICMP code as defined in RFC 4443. +// Values for ICMP destination unreachable code as defined in RFC 4443 section +// 3.1. const ( - ICMPv6PortUnreachable = 4 + ICMPv6NetworkUnreachable = 0 + ICMPv6Prohibited = 1 + ICMPv6BeyondScope = 2 + ICMPv6AddressUnreachable = 3 + ICMPv6PortUnreachable = 4 + ICMPv6Policy = 5 + ICMPv6RejectRoute = 6 ) // Type is the ICMP type field. diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go index a2bb773d4..e12a5929b 100644 --- a/pkg/tcpip/link/channel/channel.go +++ b/pkg/tcpip/link/channel/channel.go @@ -302,3 +302,7 @@ func (e *Endpoint) RemoveNotify(handle *NotificationHandle) { func (*Endpoint) ARPHardwareType() header.ARPHardwareType { return header.ARPHardwareNone } + +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (e *Endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { +} diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 6aa1badc7..c18bb91fb 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -386,26 +386,33 @@ const ( _VIRTIO_NET_HDR_GSO_TCPV6 = 4 ) -// WritePacket writes outbound packets to the file descriptor. If it is not -// currently writable, the packet is dropped. -func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (e *endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { if e.hdrSize > 0 { // Add ethernet header if needed. eth := header.Ethernet(pkt.Header.Prepend(header.EthernetMinimumSize)) pkt.LinkHeader = buffer.View(eth) ethHdr := &header.EthernetFields{ - DstAddr: r.RemoteLinkAddress, + DstAddr: remote, Type: protocol, } // Preserve the src address if it's set in the route. - if r.LocalLinkAddress != "" { - ethHdr.SrcAddr = r.LocalLinkAddress + if local != "" { + ethHdr.SrcAddr = local } else { ethHdr.SrcAddr = e.addr } eth.Encode(ethHdr) } +} + +// WritePacket writes outbound packets to the file descriptor. If it is not +// currently writable, the packet is dropped. +func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { + if e.hdrSize > 0 { + e.AddHeader(r.LocalLinkAddress, r.RemoteLinkAddress, protocol, pkt) + } var builder iovec.Builder @@ -448,22 +455,8 @@ func (e *endpoint) sendBatch(batchFD int, batch []*stack.PacketBuffer) (int, *tc // Send a batch of packets through batchFD. mmsgHdrs := make([]rawfile.MMsgHdr, 0, len(batch)) for _, pkt := range batch { - var eth header.Ethernet if e.hdrSize > 0 { - // Add ethernet header if needed. - eth = make(header.Ethernet, header.EthernetMinimumSize) - ethHdr := &header.EthernetFields{ - DstAddr: pkt.EgressRoute.RemoteLinkAddress, - Type: pkt.NetworkProtocolNumber, - } - - // Preserve the src address if it's set in the route. - if pkt.EgressRoute.LocalLinkAddress != "" { - ethHdr.SrcAddr = pkt.EgressRoute.LocalLinkAddress - } else { - ethHdr.SrcAddr = e.addr - } - eth.Encode(ethHdr) + e.AddHeader(pkt.EgressRoute.LocalLinkAddress, pkt.EgressRoute.RemoteLinkAddress, pkt.NetworkProtocolNumber, pkt) } var vnetHdrBuf []byte @@ -493,7 +486,6 @@ func (e *endpoint) sendBatch(batchFD int, batch []*stack.PacketBuffer) (int, *tc var builder iovec.Builder builder.Add(vnetHdrBuf) - builder.Add(eth) builder.Add(pkt.Header.View()) for _, v := range pkt.Data.Views() { builder.Add(v) diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go index 4bad930c7..7b995b85a 100644 --- a/pkg/tcpip/link/fdbased/endpoint_test.go +++ b/pkg/tcpip/link/fdbased/endpoint_test.go @@ -107,6 +107,10 @@ func (c *context) DeliverNetworkPacket(remote tcpip.LinkAddress, local tcpip.Lin c.ch <- packetInfo{remote, protocol, pkt} } +func (c *context) DeliverOutboundPacket(remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + panic("unimplemented") +} + func TestNoEthernetProperties(t *testing.T) { c := newContext(t, &Options{MTU: mtu}) defer c.cleanup() @@ -510,6 +514,10 @@ func (d *fakeNetworkDispatcher) DeliverNetworkPacket(remote, local tcpip.LinkAdd d.pkts = append(d.pkts, pkt) } +func (d *fakeNetworkDispatcher) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + panic("unimplemented") +} + func TestDispatchPacketFormat(t *testing.T) { for _, test := range []struct { name string diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go index 3b17d8c28..781cdd317 100644 --- a/pkg/tcpip/link/loopback/loopback.go +++ b/pkg/tcpip/link/loopback/loopback.go @@ -118,3 +118,6 @@ func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { func (*endpoint) ARPHardwareType() header.ARPHardwareType { return header.ARPHardwareLoopback } + +func (e *endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { +} diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go index c305d9e86..56a611825 100644 --- a/pkg/tcpip/link/muxed/injectable.go +++ b/pkg/tcpip/link/muxed/injectable.go @@ -135,6 +135,10 @@ func (*InjectableEndpoint) ARPHardwareType() header.ARPHardwareType { panic("unsupported operation") } +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (*InjectableEndpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { +} + // NewInjectableEndpoint creates a new multi-endpoint injectable endpoint. func NewInjectableEndpoint(routes map[tcpip.Address]stack.InjectableLinkEndpoint) *InjectableEndpoint { return &InjectableEndpoint{ diff --git a/pkg/tcpip/link/nested/nested.go b/pkg/tcpip/link/nested/nested.go index 328bd048e..d40de54df 100644 --- a/pkg/tcpip/link/nested/nested.go +++ b/pkg/tcpip/link/nested/nested.go @@ -61,6 +61,16 @@ func (e *Endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protoco } } +// DeliverOutboundPacket implements stack.NetworkDispatcher.DeliverOutboundPacket. +func (e *Endpoint) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + e.mu.RLock() + d := e.dispatcher + e.mu.RUnlock() + if d != nil { + d.DeliverOutboundPacket(remote, local, protocol, pkt) + } +} + // Attach implements stack.LinkEndpoint. func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) { e.mu.Lock() @@ -135,3 +145,8 @@ func (e *Endpoint) GSOMaxSize() uint32 { func (e *Endpoint) ARPHardwareType() header.ARPHardwareType { return e.child.ARPHardwareType() } + +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (e *Endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + e.child.AddHeader(local, remote, protocol, pkt) +} diff --git a/pkg/tcpip/link/nested/nested_test.go b/pkg/tcpip/link/nested/nested_test.go index c1a219f02..7d9249c1c 100644 --- a/pkg/tcpip/link/nested/nested_test.go +++ b/pkg/tcpip/link/nested/nested_test.go @@ -55,6 +55,10 @@ func (d *counterDispatcher) DeliverNetworkPacket(tcpip.LinkAddress, tcpip.LinkAd d.count++ } +func (d *counterDispatcher) DeliverOutboundPacket(tcpip.LinkAddress, tcpip.LinkAddress, tcpip.NetworkProtocolNumber, *stack.PacketBuffer) { + panic("unimplemented") +} + func TestNestedLinkEndpoint(t *testing.T) { const emptyAddress = tcpip.LinkAddress("") diff --git a/pkg/tcpip/link/packetsocket/BUILD b/pkg/tcpip/link/packetsocket/BUILD new file mode 100644 index 000000000..6fff160ce --- /dev/null +++ b/pkg/tcpip/link/packetsocket/BUILD @@ -0,0 +1,14 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "packetsocket", + srcs = ["endpoint.go"], + visibility = ["//visibility:public"], + deps = [ + "//pkg/tcpip", + "//pkg/tcpip/link/nested", + "//pkg/tcpip/stack", + ], +) diff --git a/pkg/tcpip/link/packetsocket/endpoint.go b/pkg/tcpip/link/packetsocket/endpoint.go new file mode 100644 index 000000000..3922c2a04 --- /dev/null +++ b/pkg/tcpip/link/packetsocket/endpoint.go @@ -0,0 +1,50 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package packetsocket provides a link layer endpoint that provides the ability +// to loop outbound packets to any AF_PACKET sockets that may be interested in +// the outgoing packet. +package packetsocket + +import ( + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/link/nested" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +type endpoint struct { + nested.Endpoint +} + +// New creates a new packetsocket LinkEndpoint. +func New(lower stack.LinkEndpoint) stack.LinkEndpoint { + e := &endpoint{} + e.Endpoint.Init(lower, e) + return e +} + +// WritePacket implements stack.LinkEndpoint.WritePacket. +func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { + e.Endpoint.DeliverOutboundPacket(r.RemoteLinkAddress, r.LocalLinkAddress, protocol, pkt) + return e.Endpoint.WritePacket(r, gso, protocol, pkt) +} + +// WritePackets implements stack.LinkEndpoint.WritePackets. +func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, proto tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { + e.Endpoint.DeliverOutboundPacket(pkt.EgressRoute.RemoteLinkAddress, pkt.EgressRoute.LocalLinkAddress, pkt.NetworkProtocolNumber, pkt) + } + + return e.Endpoint.WritePackets(r, gso, pkts, proto) +} diff --git a/pkg/tcpip/link/qdisc/fifo/endpoint.go b/pkg/tcpip/link/qdisc/fifo/endpoint.go index c84fe1bb9..467083239 100644 --- a/pkg/tcpip/link/qdisc/fifo/endpoint.go +++ b/pkg/tcpip/link/qdisc/fifo/endpoint.go @@ -107,6 +107,11 @@ func (e *endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protoco e.dispatcher.DeliverNetworkPacket(remote, local, protocol, pkt) } +// DeliverOutboundPacket implements stack.NetworkDispatcher.DeliverOutboundPacket. +func (e *endpoint) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + e.dispatcher.DeliverOutboundPacket(remote, local, protocol, pkt) +} + // Attach implements stack.LinkEndpoint.Attach. func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { e.dispatcher = dispatcher @@ -194,6 +199,8 @@ func (e *endpoint) WritePackets(_ *stack.Route, _ *stack.GSO, pkts stack.PacketB // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { + // TODO(gvisor.dev/issue/3267/): Queue these packets as well once + // WriteRawPacket takes PacketBuffer instead of VectorisedView. return e.lower.WriteRawPacket(vv) } @@ -213,3 +220,8 @@ func (e *endpoint) Wait() { func (e *endpoint) ARPHardwareType() header.ARPHardwareType { return e.lower.ARPHardwareType() } + +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (e *endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + e.lower.AddHeader(local, remote, protocol, pkt) +} diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go index a36862c67..507c76b76 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem.go +++ b/pkg/tcpip/link/sharedmem/sharedmem.go @@ -183,22 +183,29 @@ func (e *endpoint) LinkAddress() tcpip.LinkAddress { return e.addr } -// WritePacket writes outbound packets to the file descriptor. If it is not -// currently writable, the packet is dropped. -func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { - // Add the ethernet header here. +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (e *endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + // Add ethernet header if needed. eth := header.Ethernet(pkt.Header.Prepend(header.EthernetMinimumSize)) pkt.LinkHeader = buffer.View(eth) ethHdr := &header.EthernetFields{ - DstAddr: r.RemoteLinkAddress, + DstAddr: remote, Type: protocol, } - if r.LocalLinkAddress != "" { - ethHdr.SrcAddr = r.LocalLinkAddress + + // Preserve the src address if it's set in the route. + if local != "" { + ethHdr.SrcAddr = local } else { ethHdr.SrcAddr = e.addr } eth.Encode(ethHdr) +} + +// WritePacket writes outbound packets to the file descriptor. If it is not +// currently writable, the packet is dropped. +func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { + e.AddHeader(r.LocalLinkAddress, r.RemoteLinkAddress, protocol, pkt) v := pkt.Data.ToView() // Transmit the packet. diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go index 28a2e88ba..8f3cd9449 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem_test.go +++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go @@ -143,6 +143,10 @@ func (c *testContext) DeliverNetworkPacket(remoteLinkAddr, localLinkAddr tcpip.L c.packetCh <- struct{}{} } +func (c *testContext) DeliverOutboundPacket(remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + panic("unimplemented") +} + func (c *testContext) cleanup() { c.ep.Close() closeFDs(&c.txCfg) diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go index d9cd4e83a..509076643 100644 --- a/pkg/tcpip/link/sniffer/sniffer.go +++ b/pkg/tcpip/link/sniffer/sniffer.go @@ -123,6 +123,11 @@ func (e *endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protoco e.Endpoint.DeliverNetworkPacket(remote, local, protocol, pkt) } +// DeliverOutboundPacket implements stack.NetworkDispatcher.DeliverOutboundPacket. +func (e *endpoint) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + e.Endpoint.DeliverOutboundPacket(remote, local, protocol, pkt) +} + func (e *endpoint) dumpPacket(prefix string, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { writer := e.writer if writer == nil && atomic.LoadUint32(&LogPackets) == 1 { diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go index 47446efec..04ae58e59 100644 --- a/pkg/tcpip/link/tun/device.go +++ b/pkg/tcpip/link/tun/device.go @@ -272,21 +272,9 @@ func (d *Device) encodePkt(info *channel.PacketInfo) (buffer.View, bool) { if d.hasFlags(linux.IFF_TAP) { // Add ethernet header if not provided. if info.Pkt.LinkHeader == nil { - hdr := &header.EthernetFields{ - SrcAddr: info.Route.LocalLinkAddress, - DstAddr: info.Route.RemoteLinkAddress, - Type: info.Proto, - } - if hdr.SrcAddr == "" { - hdr.SrcAddr = d.endpoint.LinkAddress() - } - - eth := make(header.Ethernet, header.EthernetMinimumSize) - eth.Encode(hdr) - vv.AppendView(buffer.View(eth)) - } else { - vv.AppendView(info.Pkt.LinkHeader) + d.endpoint.AddHeader(info.Route.LocalLinkAddress, info.Route.RemoteLinkAddress, info.Proto, info.Pkt) } + vv.AppendView(info.Pkt.LinkHeader) } // Append upper headers. @@ -366,3 +354,30 @@ func (e *tunEndpoint) ARPHardwareType() header.ARPHardwareType { } return header.ARPHardwareNone } + +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (e *tunEndpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + if !e.isTap { + return + } + eth := header.Ethernet(pkt.Header.Prepend(header.EthernetMinimumSize)) + pkt.LinkHeader = buffer.View(eth) + hdr := &header.EthernetFields{ + SrcAddr: local, + DstAddr: remote, + Type: protocol, + } + if hdr.SrcAddr == "" { + hdr.SrcAddr = e.LinkAddress() + } + + eth.Encode(hdr) +} + +// MaxHeaderLength returns the maximum size of the link layer header. +func (e *tunEndpoint) MaxHeaderLength() uint16 { + if e.isTap { + return header.EthernetMinimumSize + } + return 0 +} diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go index 24a8dc2eb..b152a0f26 100644 --- a/pkg/tcpip/link/waitable/waitable.go +++ b/pkg/tcpip/link/waitable/waitable.go @@ -60,6 +60,15 @@ func (e *Endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protoco e.dispatchGate.Leave() } +// DeliverOutboundPacket implements stack.NetworkDispatcher.DeliverOutboundPacket. +func (e *Endpoint) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + if !e.dispatchGate.Enter() { + return + } + e.dispatcher.DeliverOutboundPacket(remote, local, protocol, pkt) + e.dispatchGate.Leave() +} + // Attach implements stack.LinkEndpoint.Attach. It saves the dispatcher and // registers with the lower endpoint as its dispatcher so that "e" is called // for inbound packets. @@ -153,3 +162,8 @@ func (e *Endpoint) Wait() {} func (e *Endpoint) ARPHardwareType() header.ARPHardwareType { return e.lower.ARPHardwareType() } + +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (e *Endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + e.lower.AddHeader(local, remote, protocol, pkt) +} diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go index ffb2354be..c448a888f 100644 --- a/pkg/tcpip/link/waitable/waitable_test.go +++ b/pkg/tcpip/link/waitable/waitable_test.go @@ -40,6 +40,10 @@ func (e *countedEndpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, e.dispatchCount++ } +func (e *countedEndpoint) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + panic("unimplemented") +} + func (e *countedEndpoint) Attach(dispatcher stack.NetworkDispatcher) { e.attachCount++ e.dispatcher = dispatcher @@ -90,6 +94,11 @@ func (*countedEndpoint) ARPHardwareType() header.ARPHardwareType { // Wait implements stack.LinkEndpoint.Wait. func (*countedEndpoint) Wait() {} +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (e *countedEndpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + panic("unimplemented") +} + func TestWaitWrite(t *testing.T) { ep := &countedEndpoint{} wep := New(ep) diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go index b0f57040c..31a242482 100644 --- a/pkg/tcpip/network/arp/arp.go +++ b/pkg/tcpip/network/arp/arp.go @@ -160,9 +160,12 @@ func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber { } // LinkAddressRequest implements stack.LinkAddressResolver.LinkAddressRequest. -func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.LinkEndpoint) *tcpip.Error { +func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, linkEP stack.LinkEndpoint) *tcpip.Error { r := &stack.Route{ - RemoteLinkAddress: header.EthernetBroadcastAddress, + RemoteLinkAddress: remoteLinkAddr, + } + if len(r.RemoteLinkAddress) == 0 { + r.RemoteLinkAddress = header.EthernetBroadcastAddress } hdr := buffer.NewPrependable(int(linkEP.MaxHeaderLength()) + header.ARPSize) diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go index 66e67429c..a35a64a0f 100644 --- a/pkg/tcpip/network/arp/arp_test.go +++ b/pkg/tcpip/network/arp/arp_test.go @@ -32,10 +32,14 @@ import ( ) const ( - stackLinkAddr = tcpip.LinkAddress("\x0a\x0a\x0b\x0b\x0c\x0c") - stackAddr1 = tcpip.Address("\x0a\x00\x00\x01") - stackAddr2 = tcpip.Address("\x0a\x00\x00\x02") - stackAddrBad = tcpip.Address("\x0a\x00\x00\x03") + stackLinkAddr1 = tcpip.LinkAddress("\x0a\x0a\x0b\x0b\x0c\x0c") + stackLinkAddr2 = tcpip.LinkAddress("\x0b\x0b\x0c\x0c\x0d\x0d") + stackAddr1 = tcpip.Address("\x0a\x00\x00\x01") + stackAddr2 = tcpip.Address("\x0a\x00\x00\x02") + stackAddrBad = tcpip.Address("\x0a\x00\x00\x03") + + defaultChannelSize = 1 + defaultMTU = 65536 ) type testContext struct { @@ -50,8 +54,7 @@ func newTestContext(t *testing.T) *testContext { TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol4()}, }) - const defaultMTU = 65536 - ep := channel.New(256, defaultMTU, stackLinkAddr) + ep := channel.New(defaultChannelSize, defaultMTU, stackLinkAddr1) wep := stack.LinkEndpoint(ep) if testing.Verbose() { @@ -119,7 +122,7 @@ func TestDirectRequest(t *testing.T) { if !rep.IsValid() { t.Fatalf("invalid ARP response pi.Pkt.Header.UsedLength()=%d", pi.Pkt.Header.UsedLength()) } - if got, want := tcpip.LinkAddress(rep.HardwareAddressSender()), stackLinkAddr; got != want { + if got, want := tcpip.LinkAddress(rep.HardwareAddressSender()), stackLinkAddr1; got != want { t.Errorf("got HardwareAddressSender = %s, want = %s", got, want) } if got, want := tcpip.Address(rep.ProtocolAddressSender()), tcpip.Address(h.ProtocolAddressTarget()); got != want { @@ -144,3 +147,44 @@ func TestDirectRequest(t *testing.T) { t.Errorf("stackAddrBad: unexpected packet sent, Proto=%v", pkt.Proto) } } + +func TestLinkAddressRequest(t *testing.T) { + tests := []struct { + name string + remoteLinkAddr tcpip.LinkAddress + expectLinkAddr tcpip.LinkAddress + }{ + { + name: "Unicast", + remoteLinkAddr: stackLinkAddr2, + expectLinkAddr: stackLinkAddr2, + }, + { + name: "Multicast", + remoteLinkAddr: "", + expectLinkAddr: header.EthernetBroadcastAddress, + }, + } + + for _, test := range tests { + p := arp.NewProtocol() + linkRes, ok := p.(stack.LinkAddressResolver) + if !ok { + t.Fatal("expected ARP protocol to implement stack.LinkAddressResolver") + } + + linkEP := channel.New(defaultChannelSize, defaultMTU, stackLinkAddr1) + if err := linkRes.LinkAddressRequest(stackAddr1, stackAddr2, test.remoteLinkAddr, linkEP); err != nil { + t.Errorf("got p.LinkAddressRequest(%s, %s, %s, _) = %s", stackAddr1, stackAddr2, test.remoteLinkAddr, err) + } + + pkt, ok := linkEP.Read() + if !ok { + t.Fatal("expected to send a link address request") + } + + if got, want := pkt.Route.RemoteLinkAddress, test.expectLinkAddr; got != want { + t.Errorf("got pkt.Route.RemoteLinkAddress = %s, want = %s", got, want) + } + } +} diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go index a5b780ca2..615bae648 100644 --- a/pkg/tcpip/network/ip_test.go +++ b/pkg/tcpip/network/ip_test.go @@ -185,6 +185,11 @@ func (*testObject) ARPHardwareType() header.ARPHardwareType { panic("not implemented") } +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (*testObject) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + panic("not implemented") +} + func buildIPv4Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) { s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()}, diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go index 1b67aa066..83e71cb8c 100644 --- a/pkg/tcpip/network/ipv4/icmp.go +++ b/pkg/tcpip/network/ipv4/icmp.go @@ -129,6 +129,9 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer) { pkt.Data.TrimFront(header.ICMPv4MinimumSize) switch h.Code() { + case header.ICMPv4HostUnreachable: + e.handleControl(stack.ControlNoRoute, 0, pkt) + case header.ICMPv4PortUnreachable: e.handleControl(stack.ControlPortUnreachable, 0, pkt) diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go index 3b79749b5..24600d877 100644 --- a/pkg/tcpip/network/ipv6/icmp.go +++ b/pkg/tcpip/network/ipv6/icmp.go @@ -128,6 +128,8 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme } pkt.Data.TrimFront(header.ICMPv6DstUnreachableMinimumSize) switch header.ICMPv6(hdr).Code() { + case header.ICMPv6NetworkUnreachable: + e.handleControl(stack.ControlNetworkUnreachable, 0, pkt) case header.ICMPv6PortUnreachable: e.handleControl(stack.ControlPortUnreachable, 0, pkt) } @@ -502,7 +504,7 @@ func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber { } // LinkAddressRequest implements stack.LinkAddressResolver. -func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.LinkEndpoint) *tcpip.Error { +func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, linkEP stack.LinkEndpoint) *tcpip.Error { snaddr := header.SolicitedNodeAddr(addr) // TODO(b/148672031): Use stack.FindRoute instead of manually creating the @@ -511,8 +513,12 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack. r := &stack.Route{ LocalAddress: localAddr, RemoteAddress: snaddr, - RemoteLinkAddress: header.EthernetAddressFromMulticastIPv6Address(snaddr), + RemoteLinkAddress: remoteLinkAddr, } + if len(r.RemoteLinkAddress) == 0 { + r.RemoteLinkAddress = header.EthernetAddressFromMulticastIPv6Address(snaddr) + } + hdr := buffer.NewPrependable(int(linkEP.MaxHeaderLength()) + header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize) pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize)) pkt.SetType(header.ICMPv6NeighborSolicit) diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go index 52a01b44e..f86aaed1d 100644 --- a/pkg/tcpip/network/ipv6/icmp_test.go +++ b/pkg/tcpip/network/ipv6/icmp_test.go @@ -34,6 +34,9 @@ const ( linkAddr0 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06") linkAddr1 = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0e") linkAddr2 = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0f") + + defaultChannelSize = 1 + defaultMTU = 65536 ) var ( @@ -257,8 +260,7 @@ func newTestContext(t *testing.T) *testContext { }), } - const defaultMTU = 65536 - c.linkEP0 = channel.New(256, defaultMTU, linkAddr0) + c.linkEP0 = channel.New(defaultChannelSize, defaultMTU, linkAddr0) wrappedEP0 := stack.LinkEndpoint(endpointWithResolutionCapability{LinkEndpoint: c.linkEP0}) if testing.Verbose() { @@ -271,7 +273,7 @@ func newTestContext(t *testing.T) *testContext { t.Fatalf("AddAddress lladdr0: %v", err) } - c.linkEP1 = channel.New(256, defaultMTU, linkAddr1) + c.linkEP1 = channel.New(defaultChannelSize, defaultMTU, linkAddr1) wrappedEP1 := stack.LinkEndpoint(endpointWithResolutionCapability{LinkEndpoint: c.linkEP1}) if err := c.s1.CreateNIC(1, wrappedEP1); err != nil { t.Fatalf("CreateNIC failed: %v", err) @@ -951,3 +953,47 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) { }) } } + +func TestLinkAddressRequest(t *testing.T) { + snaddr := header.SolicitedNodeAddr(lladdr0) + mcaddr := header.EthernetAddressFromMulticastIPv6Address(snaddr) + + tests := []struct { + name string + remoteLinkAddr tcpip.LinkAddress + expectLinkAddr tcpip.LinkAddress + }{ + { + name: "Unicast", + remoteLinkAddr: linkAddr1, + expectLinkAddr: linkAddr1, + }, + { + name: "Multicast", + remoteLinkAddr: "", + expectLinkAddr: mcaddr, + }, + } + + for _, test := range tests { + p := NewProtocol() + linkRes, ok := p.(stack.LinkAddressResolver) + if !ok { + t.Fatalf("expected IPv6 protocol to implement stack.LinkAddressResolver") + } + + linkEP := channel.New(defaultChannelSize, defaultMTU, linkAddr0) + if err := linkRes.LinkAddressRequest(lladdr0, lladdr1, test.remoteLinkAddr, linkEP); err != nil { + t.Errorf("got p.LinkAddressRequest(%s, %s, %s, _) = %s", lladdr0, lladdr1, test.remoteLinkAddr, err) + } + + pkt, ok := linkEP.Read() + if !ok { + t.Fatal("expected to send a link address request") + } + + if got, want := pkt.Route.RemoteLinkAddress, test.expectLinkAddr; got != want { + t.Errorf("got pkt.Route.RemoteLinkAddress = %s, want = %s", got, want) + } + } +} diff --git a/pkg/tcpip/stack/conntrack.go b/pkg/tcpip/stack/conntrack.go index d39baf620..559a1c4dd 100644 --- a/pkg/tcpip/stack/conntrack.go +++ b/pkg/tcpip/stack/conntrack.go @@ -49,7 +49,8 @@ const ( type manipType int const ( - manipDstPrerouting manipType = iota + manipNone manipType = iota + manipDstPrerouting manipDstOutput ) @@ -113,13 +114,11 @@ type conn struct { // update the state of tcb. It is immutable. tcbHook Hook - // mu protects tcb. + // mu protects all mutable state. mu sync.Mutex `state:"nosave"` - // tcb is TCB control block. It is used to keep track of states // of tcp connection and is protected by mu. tcb tcpconntrack.TCB - // lastUsed is the last time the connection saw a relevant packet, and // is updated by each packet on the connection. It is protected by mu. lastUsed time.Time `state:".(unixTime)"` @@ -141,8 +140,26 @@ func (cn *conn) timedOut(now time.Time) bool { return now.Sub(cn.lastUsed) > defaultTimeout } +// update the connection tracking state. +// +// Precondition: ct.mu must be held. +func (ct *conn) updateLocked(tcpHeader header.TCP, hook Hook) { + // Update the state of tcb. tcb assumes it's always initialized on the + // client. However, we only need to know whether the connection is + // established or not, so the client/server distinction isn't important. + // TODO(gvisor.dev/issue/170): Add support in tcpconntrack to handle + // other tcp states. + if ct.tcb.IsEmpty() { + ct.tcb.Init(tcpHeader) + } else if hook == ct.tcbHook { + ct.tcb.UpdateStateOutbound(tcpHeader) + } else { + ct.tcb.UpdateStateInbound(tcpHeader) + } +} + // ConnTrack tracks all connections created for NAT rules. Most users are -// expected to only call handlePacket and createConnFor. +// expected to only call handlePacket, insertRedirectConn, and maybeInsertNoop. // // ConnTrack keeps all connections in a slice of buckets, each of which holds a // linked list of tuples. This gives us some desirable properties: @@ -248,8 +265,7 @@ func (ct *ConnTrack) connFor(pkt *PacketBuffer) (*conn, direction) { return nil, dirOriginal } -// createConnFor creates a new conn for pkt. -func (ct *ConnTrack) createConnFor(pkt *PacketBuffer, hook Hook, rt RedirectTarget) *conn { +func (ct *ConnTrack) insertRedirectConn(pkt *PacketBuffer, hook Hook, rt RedirectTarget) *conn { tid, err := packetToTupleID(pkt) if err != nil { return nil @@ -272,10 +288,15 @@ func (ct *ConnTrack) createConnFor(pkt *PacketBuffer, hook Hook, rt RedirectTarg manip = manipDstOutput } conn := newConn(tid, replyTID, manip, hook) + ct.insertConn(conn) + return conn +} +// insertConn inserts conn into the appropriate table bucket. +func (ct *ConnTrack) insertConn(conn *conn) { // Lock the buckets in the correct order. - tupleBucket := ct.bucket(tid) - replyBucket := ct.bucket(replyTID) + tupleBucket := ct.bucket(conn.original.tupleID) + replyBucket := ct.bucket(conn.reply.tupleID) ct.mu.RLock() defer ct.mu.RUnlock() if tupleBucket < replyBucket { @@ -289,22 +310,37 @@ func (ct *ConnTrack) createConnFor(pkt *PacketBuffer, hook Hook, rt RedirectTarg ct.buckets[tupleBucket].mu.Lock() } - // Add the tuple to the map. - ct.buckets[tupleBucket].tuples.PushFront(&conn.original) - ct.buckets[replyBucket].tuples.PushFront(&conn.reply) + // Now that we hold the locks, ensure the tuple hasn't been inserted by + // another thread. + alreadyInserted := false + for other := ct.buckets[tupleBucket].tuples.Front(); other != nil; other = other.Next() { + if other.tupleID == conn.original.tupleID { + alreadyInserted = true + break + } + } + + if !alreadyInserted { + // Add the tuple to the map. + ct.buckets[tupleBucket].tuples.PushFront(&conn.original) + ct.buckets[replyBucket].tuples.PushFront(&conn.reply) + } // Unlocking can happen in any order. ct.buckets[tupleBucket].mu.Unlock() if tupleBucket != replyBucket { ct.buckets[replyBucket].mu.Unlock() } - - return conn } // handlePacketPrerouting manipulates ports for packets in Prerouting hook. // TODO(gvisor.dev/issue/170): Change address for Prerouting hook. func handlePacketPrerouting(pkt *PacketBuffer, conn *conn, dir direction) { + // If this is a noop entry, don't do anything. + if conn.manip == manipNone { + return + } + netHeader := header.IPv4(pkt.NetworkHeader) tcpHeader := header.TCP(pkt.TransportHeader) @@ -322,12 +358,22 @@ func handlePacketPrerouting(pkt *PacketBuffer, conn *conn, dir direction) { netHeader.SetSourceAddress(conn.original.dstAddr) } + // TODO(gvisor.dev/issue/170): TCP checksums aren't usually validated + // on inbound packets, so we don't recalculate them. However, we should + // support cases when they are validated, e.g. when we can't offload + // receive checksumming. + netHeader.SetChecksum(0) netHeader.SetChecksum(^netHeader.CalculateChecksum()) } // handlePacketOutput manipulates ports for packets in Output hook. func handlePacketOutput(pkt *PacketBuffer, conn *conn, gso *GSO, r *Route, dir direction) { + // If this is a noop entry, don't do anything. + if conn.manip == manipNone { + return + } + netHeader := header.IPv4(pkt.NetworkHeader) tcpHeader := header.TCP(pkt.TransportHeader) @@ -362,20 +408,31 @@ func handlePacketOutput(pkt *PacketBuffer, conn *conn, gso *GSO, r *Route, dir d } // handlePacket will manipulate the port and address of the packet if the -// connection exists. -func (ct *ConnTrack) handlePacket(pkt *PacketBuffer, hook Hook, gso *GSO, r *Route) { +// connection exists. Returns whether, after the packet traverses the tables, +// it should create a new entry in the table. +func (ct *ConnTrack) handlePacket(pkt *PacketBuffer, hook Hook, gso *GSO, r *Route) bool { if pkt.NatDone { - return + return false } if hook != Prerouting && hook != Output { - return + return false + } + + // TODO(gvisor.dev/issue/170): Support other transport protocols. + if pkt.NetworkHeader == nil || header.IPv4(pkt.NetworkHeader).TransportProtocol() != header.TCPProtocolNumber { + return false } conn, dir := ct.connFor(pkt) + // Connection or Rule not found for the packet. if conn == nil { - // Connection not found for the packet or the packet is invalid. - return + return true + } + + tcpHeader := header.TCP(pkt.TransportHeader) + if tcpHeader == nil { + return false } switch hook { @@ -395,14 +452,39 @@ func (ct *ConnTrack) handlePacket(pkt *PacketBuffer, hook Hook, gso *GSO, r *Rou // Mark the connection as having been used recently so it isn't reaped. conn.lastUsed = time.Now() // Update connection state. - if tcpHeader := header.TCP(pkt.TransportHeader); conn.tcb.IsEmpty() { - conn.tcb.Init(tcpHeader) - conn.tcbHook = hook - } else if hook == conn.tcbHook { - conn.tcb.UpdateStateOutbound(tcpHeader) - } else { - conn.tcb.UpdateStateInbound(tcpHeader) + conn.updateLocked(header.TCP(pkt.TransportHeader), hook) + + return false +} + +// maybeInsertNoop tries to insert a no-op connection entry to keep connections +// from getting clobbered when replies arrive. It only inserts if there isn't +// already a connection for pkt. +// +// This should be called after traversing iptables rules only, to ensure that +// pkt.NatDone is set correctly. +func (ct *ConnTrack) maybeInsertNoop(pkt *PacketBuffer, hook Hook) { + // If there were a rule applying to this packet, it would be marked + // with NatDone. + if pkt.NatDone { + return + } + + // We only track TCP connections. + if pkt.NetworkHeader == nil || header.IPv4(pkt.NetworkHeader).TransportProtocol() != header.TCPProtocolNumber { + return + } + + // This is the first packet we're seeing for the TCP connection. Insert + // the noop entry (an identity mapping) so that the response doesn't + // get NATed, breaking the connection. + tid, err := packetToTupleID(pkt) + if err != nil { + return } + conn := newConn(tid, tid.reply(), manipNone, hook) + conn.updateLocked(header.TCP(pkt.TransportHeader), hook) + ct.insertConn(conn) } // bucket gets the conntrack bucket for a tupleID. diff --git a/pkg/tcpip/stack/forwarder_test.go b/pkg/tcpip/stack/forwarder_test.go index eefb4b07f..c962693f5 100644 --- a/pkg/tcpip/stack/forwarder_test.go +++ b/pkg/tcpip/stack/forwarder_test.go @@ -121,10 +121,12 @@ func (*fwdTestNetworkEndpoint) Close() {} type fwdTestNetworkProtocol struct { addrCache *linkAddrCache addrResolveDelay time.Duration - onLinkAddressResolved func(cache *linkAddrCache, addr tcpip.Address) + onLinkAddressResolved func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) onResolveStaticAddress func(tcpip.Address) (tcpip.LinkAddress, bool) } +var _ LinkAddressResolver = (*fwdTestNetworkProtocol)(nil) + func (f *fwdTestNetworkProtocol) Number() tcpip.NetworkProtocolNumber { return fwdTestNetNumber } @@ -174,10 +176,10 @@ func (f *fwdTestNetworkProtocol) Close() {} func (f *fwdTestNetworkProtocol) Wait() {} -func (f *fwdTestNetworkProtocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP LinkEndpoint) *tcpip.Error { +func (f *fwdTestNetworkProtocol) LinkAddressRequest(addr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, linkEP LinkEndpoint) *tcpip.Error { if f.addrCache != nil && f.onLinkAddressResolved != nil { time.AfterFunc(f.addrResolveDelay, func() { - f.onLinkAddressResolved(f.addrCache, addr) + f.onLinkAddressResolved(f.addrCache, addr, remoteLinkAddr) }) } return nil @@ -307,6 +309,11 @@ func (*fwdTestLinkEndpoint) ARPHardwareType() header.ARPHardwareType { panic("not implemented") } +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (e *fwdTestLinkEndpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { + panic("not implemented") +} + func fwdTestNetFactory(t *testing.T, proto *fwdTestNetworkProtocol) (ep1, ep2 *fwdTestLinkEndpoint) { // Create a stack with the network protocol and two NICs. s := New(Options{ @@ -400,7 +407,7 @@ func TestForwardingWithFakeResolver(t *testing.T) { // Create a network protocol with a fake resolver. proto := &fwdTestNetworkProtocol{ addrResolveDelay: 500 * time.Millisecond, - onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) { + onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) { // Any address will be resolved to the link address "c". cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c") }, @@ -458,7 +465,7 @@ func TestForwardingWithFakeResolverPartialTimeout(t *testing.T) { // Create a network protocol with a fake resolver. proto := &fwdTestNetworkProtocol{ addrResolveDelay: 500 * time.Millisecond, - onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) { + onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) { // Only packets to address 3 will be resolved to the // link address "c". if addr == "\x03" { @@ -510,7 +517,7 @@ func TestForwardingWithFakeResolverTwoPackets(t *testing.T) { // Create a network protocol with a fake resolver. proto := &fwdTestNetworkProtocol{ addrResolveDelay: 500 * time.Millisecond, - onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) { + onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) { // Any packets will be resolved to the link address "c". cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c") }, @@ -554,7 +561,7 @@ func TestForwardingWithFakeResolverManyPackets(t *testing.T) { // Create a network protocol with a fake resolver. proto := &fwdTestNetworkProtocol{ addrResolveDelay: 500 * time.Millisecond, - onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) { + onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) { // Any packets will be resolved to the link address "c". cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c") }, @@ -611,7 +618,7 @@ func TestForwardingWithFakeResolverManyResolutions(t *testing.T) { // Create a network protocol with a fake resolver. proto := &fwdTestNetworkProtocol{ addrResolveDelay: 500 * time.Millisecond, - onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) { + onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) { // Any packets will be resolved to the link address "c". cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c") }, diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go index bbf3b60e8..cbbae4224 100644 --- a/pkg/tcpip/stack/iptables.go +++ b/pkg/tcpip/stack/iptables.go @@ -22,22 +22,30 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/header" ) -// Table names. +// tableID is an index into IPTables.tables. +type tableID int + const ( - TablenameNat = "nat" - TablenameMangle = "mangle" - TablenameFilter = "filter" + natID tableID = iota + mangleID + filterID + numTables ) -// Chain names as defined by net/ipv4/netfilter/ip_tables.c. +// Table names. const ( - ChainNamePrerouting = "PREROUTING" - ChainNameInput = "INPUT" - ChainNameForward = "FORWARD" - ChainNameOutput = "OUTPUT" - ChainNamePostrouting = "POSTROUTING" + NATTable = "nat" + MangleTable = "mangle" + FilterTable = "filter" ) +// nameToID is immutable. +var nameToID = map[string]tableID{ + NATTable: natID, + MangleTable: mangleID, + FilterTable: filterID, +} + // HookUnset indicates that there is no hook set for an entrypoint or // underflow. const HookUnset = -1 @@ -48,11 +56,9 @@ const reaperDelay = 5 * time.Second // DefaultTables returns a default set of tables. Each chain is set to accept // all packets. func DefaultTables() *IPTables { - // TODO(gvisor.dev/issue/170): We may be able to swap out some strings for - // iotas. return &IPTables{ - tables: map[string]Table{ - TablenameNat: Table{ + tables: [numTables]Table{ + natID: Table{ Rules: []Rule{ Rule{Target: AcceptTarget{}}, Rule{Target: AcceptTarget{}}, @@ -60,60 +66,66 @@ func DefaultTables() *IPTables { Rule{Target: AcceptTarget{}}, Rule{Target: ErrorTarget{}}, }, - BuiltinChains: map[Hook]int{ + BuiltinChains: [NumHooks]int{ Prerouting: 0, Input: 1, + Forward: HookUnset, Output: 2, Postrouting: 3, }, - Underflows: map[Hook]int{ + Underflows: [NumHooks]int{ Prerouting: 0, Input: 1, + Forward: HookUnset, Output: 2, Postrouting: 3, }, - UserChains: map[string]int{}, }, - TablenameMangle: Table{ + mangleID: Table{ Rules: []Rule{ Rule{Target: AcceptTarget{}}, Rule{Target: AcceptTarget{}}, Rule{Target: ErrorTarget{}}, }, - BuiltinChains: map[Hook]int{ + BuiltinChains: [NumHooks]int{ Prerouting: 0, Output: 1, }, - Underflows: map[Hook]int{ - Prerouting: 0, - Output: 1, + Underflows: [NumHooks]int{ + Prerouting: 0, + Input: HookUnset, + Forward: HookUnset, + Output: 1, + Postrouting: HookUnset, }, - UserChains: map[string]int{}, }, - TablenameFilter: Table{ + filterID: Table{ Rules: []Rule{ Rule{Target: AcceptTarget{}}, Rule{Target: AcceptTarget{}}, Rule{Target: AcceptTarget{}}, Rule{Target: ErrorTarget{}}, }, - BuiltinChains: map[Hook]int{ - Input: 0, - Forward: 1, - Output: 2, + BuiltinChains: [NumHooks]int{ + Prerouting: HookUnset, + Input: 0, + Forward: 1, + Output: 2, + Postrouting: HookUnset, }, - Underflows: map[Hook]int{ - Input: 0, - Forward: 1, - Output: 2, + Underflows: [NumHooks]int{ + Prerouting: HookUnset, + Input: 0, + Forward: 1, + Output: 2, + Postrouting: HookUnset, }, - UserChains: map[string]int{}, }, }, - priorities: map[Hook][]string{ - Input: []string{TablenameNat, TablenameFilter}, - Prerouting: []string{TablenameMangle, TablenameNat}, - Output: []string{TablenameMangle, TablenameNat, TablenameFilter}, + priorities: [NumHooks][]tableID{ + Prerouting: []tableID{mangleID, natID}, + Input: []tableID{natID, filterID}, + Output: []tableID{mangleID, natID, filterID}, }, connections: ConnTrack{ seed: generateRandUint32(), @@ -127,51 +139,48 @@ func DefaultTables() *IPTables { func EmptyFilterTable() Table { return Table{ Rules: []Rule{}, - BuiltinChains: map[Hook]int{ - Input: HookUnset, - Forward: HookUnset, - Output: HookUnset, + BuiltinChains: [NumHooks]int{ + Prerouting: HookUnset, + Postrouting: HookUnset, }, - Underflows: map[Hook]int{ - Input: HookUnset, - Forward: HookUnset, - Output: HookUnset, + Underflows: [NumHooks]int{ + Prerouting: HookUnset, + Postrouting: HookUnset, }, - UserChains: map[string]int{}, } } -// EmptyNatTable returns a Table with no rules and the filter table chains +// EmptyNATTable returns a Table with no rules and the filter table chains // mapped to HookUnset. -func EmptyNatTable() Table { +func EmptyNATTable() Table { return Table{ Rules: []Rule{}, - BuiltinChains: map[Hook]int{ - Prerouting: HookUnset, - Input: HookUnset, - Output: HookUnset, - Postrouting: HookUnset, + BuiltinChains: [NumHooks]int{ + Forward: HookUnset, }, - Underflows: map[Hook]int{ - Prerouting: HookUnset, - Input: HookUnset, - Output: HookUnset, - Postrouting: HookUnset, + Underflows: [NumHooks]int{ + Forward: HookUnset, }, - UserChains: map[string]int{}, } } -// GetTable returns table by name. +// GetTable returns a table by name. func (it *IPTables) GetTable(name string) (Table, bool) { + id, ok := nameToID[name] + if !ok { + return Table{}, false + } it.mu.RLock() defer it.mu.RUnlock() - t, ok := it.tables[name] - return t, ok + return it.tables[id], true } // ReplaceTable replaces or inserts table by name. -func (it *IPTables) ReplaceTable(name string, table Table) { +func (it *IPTables) ReplaceTable(name string, table Table) *tcpip.Error { + id, ok := nameToID[name] + if !ok { + return tcpip.ErrInvalidOptionValue + } it.mu.Lock() defer it.mu.Unlock() // If iptables is being enabled, initialize the conntrack table and @@ -181,14 +190,8 @@ func (it *IPTables) ReplaceTable(name string, table Table) { it.startReaper(reaperDelay) } it.modified = true - it.tables[name] = table -} - -// GetPriorities returns slice of priorities associated with hook. -func (it *IPTables) GetPriorities(hook Hook) []string { - it.mu.RLock() - defer it.mu.RUnlock() - return it.priorities[hook] + it.tables[id] = table + return nil } // A chainVerdict is what a table decides should be done with a packet. @@ -223,11 +226,19 @@ func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, addr // Packets are manipulated only if connection and matching // NAT rule exists. - it.connections.handlePacket(pkt, hook, gso, r) + shouldTrack := it.connections.handlePacket(pkt, hook, gso, r) // Go through each table containing the hook. - for _, tablename := range it.GetPriorities(hook) { - table, _ := it.GetTable(tablename) + it.mu.RLock() + defer it.mu.RUnlock() + priorities := it.priorities[hook] + for _, tableID := range priorities { + // If handlePacket already NATed the packet, we don't need to + // check the NAT table. + if tableID == natID && pkt.NatDone { + continue + } + table := it.tables[tableID] ruleIdx := table.BuiltinChains[hook] switch verdict := it.checkChain(hook, pkt, table, ruleIdx, gso, r, address, nicName); verdict { // If the table returns Accept, move on to the next table. @@ -256,6 +267,20 @@ func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, addr } } + // If this connection should be tracked, try to add an entry for it. If + // traversing the nat table didn't end in adding an entry, + // maybeInsertNoop will add a no-op entry for the connection. This is + // needeed when establishing connections so that the SYN/ACK reply to an + // outgoing SYN is delivered to the correct endpoint rather than being + // redirected by a prerouting rule. + // + // From the iptables documentation: "If there is no rule, a `null' + // binding is created: this usually does not map the packet, but exists + // to ensure we don't map another stream over an existing one." + if shouldTrack { + it.connections.maybeInsertNoop(pkt, hook) + } + // Every table returned Accept. return true } diff --git a/pkg/tcpip/stack/iptables_targets.go b/pkg/tcpip/stack/iptables_targets.go index d43f60c67..dc88033c7 100644 --- a/pkg/tcpip/stack/iptables_targets.go +++ b/pkg/tcpip/stack/iptables_targets.go @@ -153,7 +153,7 @@ func (rt RedirectTarget) Action(pkt *PacketBuffer, ct *ConnTrack, hook Hook, gso // Set up conection for matching NAT rule. Only the first // packet of the connection comes here. Other packets will be // manipulated in connection tracking. - if conn := ct.createConnFor(pkt, hook, rt); conn != nil { + if conn := ct.insertRedirectConn(pkt, hook, rt); conn != nil { ct.handlePacket(pkt, hook, gso, r) } default: diff --git a/pkg/tcpip/stack/iptables_types.go b/pkg/tcpip/stack/iptables_types.go index eb70e3104..73274ada9 100644 --- a/pkg/tcpip/stack/iptables_types.go +++ b/pkg/tcpip/stack/iptables_types.go @@ -84,14 +84,14 @@ type IPTables struct { // mu protects tables, priorities, and modified. mu sync.RWMutex - // tables maps table names to tables. User tables have arbitrary names. - // mu needs to be locked for accessing. - tables map[string]Table + // tables maps tableIDs to tables. Holds builtin tables only, not user + // tables. mu must be locked for accessing. + tables [numTables]Table // priorities maps each hook to a list of table names. The order of the // list is the order in which each table should be visited for that // hook. mu needs to be locked for accessing. - priorities map[Hook][]string + priorities [NumHooks][]tableID // modified is whether tables have been modified at least once. It is // used to elide the iptables performance overhead for workloads that @@ -113,22 +113,20 @@ type Table struct { Rules []Rule // BuiltinChains maps builtin chains to their entrypoint rule in Rules. - BuiltinChains map[Hook]int + BuiltinChains [NumHooks]int // Underflows maps builtin chains to their underflow rule in Rules // (i.e. the rule to execute if the chain returns without a verdict). - Underflows map[Hook]int - - // UserChains holds user-defined chains for the keyed by name. Users - // can give their chains arbitrary names. - UserChains map[string]int + Underflows [NumHooks]int } // ValidHooks returns a bitmap of the builtin hooks for the given table. func (table *Table) ValidHooks() uint32 { hooks := uint32(0) - for hook := range table.BuiltinChains { - hooks |= 1 << hook + for hook, ruleIdx := range table.BuiltinChains { + if ruleIdx != HookUnset { + hooks |= 1 << hook + } } return hooks } diff --git a/pkg/tcpip/stack/linkaddrcache.go b/pkg/tcpip/stack/linkaddrcache.go index 403557fd7..6f73a0ce4 100644 --- a/pkg/tcpip/stack/linkaddrcache.go +++ b/pkg/tcpip/stack/linkaddrcache.go @@ -244,7 +244,7 @@ func (c *linkAddrCache) startAddressResolution(k tcpip.FullAddress, linkRes Link for i := 0; ; i++ { // Send link request, then wait for the timeout limit and check // whether the request succeeded. - linkRes.LinkAddressRequest(k.Addr, localAddr, linkEP) + linkRes.LinkAddressRequest(k.Addr, localAddr, "" /* linkAddr */, linkEP) select { case now := <-time.After(c.resolutionTimeout): diff --git a/pkg/tcpip/stack/linkaddrcache_test.go b/pkg/tcpip/stack/linkaddrcache_test.go index 1baa498d0..b15b8d1cb 100644 --- a/pkg/tcpip/stack/linkaddrcache_test.go +++ b/pkg/tcpip/stack/linkaddrcache_test.go @@ -48,7 +48,7 @@ type testLinkAddressResolver struct { onLinkAddressRequest func() } -func (r *testLinkAddressResolver) LinkAddressRequest(addr, _ tcpip.Address, _ LinkEndpoint) *tcpip.Error { +func (r *testLinkAddressResolver) LinkAddressRequest(addr, _ tcpip.Address, _ tcpip.LinkAddress, _ LinkEndpoint) *tcpip.Error { time.AfterFunc(r.delay, func() { r.fakeRequest(addr) }) if f := r.onLinkAddressRequest; f != nil { f() diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go index e28c23d66..9dce11a97 100644 --- a/pkg/tcpip/stack/ndp.go +++ b/pkg/tcpip/stack/ndp.go @@ -469,7 +469,7 @@ type ndpState struct { rtrSolicit struct { // The timer used to send the next router solicitation message. - timer *time.Timer + timer tcpip.Timer // Used to let the Router Solicitation timer know that it has been stopped. // @@ -503,7 +503,7 @@ type ndpState struct { // to the DAD goroutine that DAD should stop. type dadState struct { // The DAD timer to send the next NS message, or resolve the address. - timer *time.Timer + timer tcpip.Timer // Used to let the DAD timer know that it has been stopped. // @@ -515,38 +515,38 @@ type dadState struct { // defaultRouterState holds data associated with a default router discovered by // a Router Advertisement (RA). type defaultRouterState struct { - // Timer to invalidate the default router. + // Job to invalidate the default router. // // Must not be nil. - invalidationTimer *tcpip.CancellableTimer + invalidationJob *tcpip.Job } // onLinkPrefixState holds data associated with an on-link prefix discovered by // a Router Advertisement's Prefix Information option (PI) when the NDP // configurations was configured to do so. type onLinkPrefixState struct { - // Timer to invalidate the on-link prefix. + // Job to invalidate the on-link prefix. // // Must not be nil. - invalidationTimer *tcpip.CancellableTimer + invalidationJob *tcpip.Job } // tempSLAACAddrState holds state associated with a temporary SLAAC address. type tempSLAACAddrState struct { - // Timer to deprecate the temporary SLAAC address. + // Job to deprecate the temporary SLAAC address. // // Must not be nil. - deprecationTimer *tcpip.CancellableTimer + deprecationJob *tcpip.Job - // Timer to invalidate the temporary SLAAC address. + // Job to invalidate the temporary SLAAC address. // // Must not be nil. - invalidationTimer *tcpip.CancellableTimer + invalidationJob *tcpip.Job - // Timer to regenerate the temporary SLAAC address. + // Job to regenerate the temporary SLAAC address. // // Must not be nil. - regenTimer *tcpip.CancellableTimer + regenJob *tcpip.Job createdAt time.Time @@ -561,15 +561,15 @@ type tempSLAACAddrState struct { // slaacPrefixState holds state associated with a SLAAC prefix. type slaacPrefixState struct { - // Timer to deprecate the prefix. + // Job to deprecate the prefix. // // Must not be nil. - deprecationTimer *tcpip.CancellableTimer + deprecationJob *tcpip.Job - // Timer to invalidate the prefix. + // Job to invalidate the prefix. // // Must not be nil. - invalidationTimer *tcpip.CancellableTimer + invalidationJob *tcpip.Job // Nonzero only when the address is not valid forever. validUntil time.Time @@ -651,12 +651,12 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref } var done bool - var timer *time.Timer + var timer tcpip.Timer // We initially start a timer to fire immediately because some of the DAD work // cannot be done while holding the NIC's lock. This is effectively the same // as starting a goroutine but we use a timer that fires immediately so we can // reset it for the next DAD iteration. - timer = time.AfterFunc(0, func() { + timer = ndp.nic.stack.Clock().AfterFunc(0, func() { ndp.nic.mu.Lock() defer ndp.nic.mu.Unlock() @@ -871,9 +871,9 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) { case ok && rl != 0: // This is an already discovered default router. Update - // the invalidation timer. - rtr.invalidationTimer.StopLocked() - rtr.invalidationTimer.Reset(rl) + // the invalidation job. + rtr.invalidationJob.Cancel() + rtr.invalidationJob.Schedule(rl) ndp.defaultRouters[ip] = rtr case ok && rl == 0: @@ -950,7 +950,7 @@ func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) { return } - rtr.invalidationTimer.StopLocked() + rtr.invalidationJob.Cancel() delete(ndp.defaultRouters, ip) // Let the integrator know a discovered default router is invalidated. @@ -979,12 +979,12 @@ func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) { } state := defaultRouterState{ - invalidationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() { + invalidationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { ndp.invalidateDefaultRouter(ip) }), } - state.invalidationTimer.Reset(rl) + state.invalidationJob.Schedule(rl) ndp.defaultRouters[ip] = state } @@ -1009,13 +1009,13 @@ func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration) } state := onLinkPrefixState{ - invalidationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() { + invalidationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { ndp.invalidateOnLinkPrefix(prefix) }), } if l < header.NDPInfiniteLifetime { - state.invalidationTimer.Reset(l) + state.invalidationJob.Schedule(l) } ndp.onLinkPrefixes[prefix] = state @@ -1033,7 +1033,7 @@ func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) { return } - s.invalidationTimer.StopLocked() + s.invalidationJob.Cancel() delete(ndp.onLinkPrefixes, prefix) // Let the integrator know a discovered on-link prefix is invalidated. @@ -1082,14 +1082,14 @@ func (ndp *ndpState) handleOnLinkPrefixInformation(pi header.NDPPrefixInformatio // This is an already discovered on-link prefix with a // new non-zero valid lifetime. // - // Update the invalidation timer. + // Update the invalidation job. - prefixState.invalidationTimer.StopLocked() + prefixState.invalidationJob.Cancel() if vl < header.NDPInfiniteLifetime { - // Prefix is valid for a finite lifetime, reset the timer to expire after + // Prefix is valid for a finite lifetime, schedule the job to execute after // the new valid lifetime. - prefixState.invalidationTimer.Reset(vl) + prefixState.invalidationJob.Schedule(vl) } ndp.onLinkPrefixes[prefix] = prefixState @@ -1154,7 +1154,7 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) { } state := slaacPrefixState{ - deprecationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() { + deprecationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { state, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the deprecated SLAAC prefix %s", prefix)) @@ -1162,7 +1162,7 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) { ndp.deprecateSLAACAddress(state.stableAddr.ref) }), - invalidationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() { + invalidationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { state, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the invalidated SLAAC prefix %s", prefix)) @@ -1184,19 +1184,19 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) { if !ndp.generateSLAACAddr(prefix, &state) { // We were unable to generate an address for the prefix, we do not nothing - // further as there is no reason to maintain state or timers for a prefix we + // further as there is no reason to maintain state or jobs for a prefix we // do not have an address for. return } - // Setup the initial timers to deprecate and invalidate prefix. + // Setup the initial jobs to deprecate and invalidate prefix. if pl < header.NDPInfiniteLifetime && pl != 0 { - state.deprecationTimer.Reset(pl) + state.deprecationJob.Schedule(pl) } if vl < header.NDPInfiniteLifetime { - state.invalidationTimer.Reset(vl) + state.invalidationJob.Schedule(vl) state.validUntil = now.Add(vl) } @@ -1428,7 +1428,7 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla } state := tempSLAACAddrState{ - deprecationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() { + deprecationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { prefixState, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to deprecate temporary address %s", prefix, generatedAddr)) @@ -1441,7 +1441,7 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla ndp.deprecateSLAACAddress(tempAddrState.ref) }), - invalidationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() { + invalidationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { prefixState, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to invalidate temporary address %s", prefix, generatedAddr)) @@ -1454,7 +1454,7 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla ndp.invalidateTempSLAACAddr(prefixState.tempAddrs, generatedAddr.Address, tempAddrState) }), - regenTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() { + regenJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { prefixState, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to regenerate temporary address after %s", prefix, generatedAddr)) @@ -1481,9 +1481,9 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla ref: ref, } - state.deprecationTimer.Reset(pl) - state.invalidationTimer.Reset(vl) - state.regenTimer.Reset(pl - ndp.configs.RegenAdvanceDuration) + state.deprecationJob.Schedule(pl) + state.invalidationJob.Schedule(vl) + state.regenJob.Schedule(pl - ndp.configs.RegenAdvanceDuration) prefixState.generationAttempts++ prefixState.tempAddrs[generatedAddr.Address] = state @@ -1518,16 +1518,16 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat prefixState.stableAddr.ref.deprecated = false } - // If prefix was preferred for some finite lifetime before, stop the - // deprecation timer so it can be reset. - prefixState.deprecationTimer.StopLocked() + // If prefix was preferred for some finite lifetime before, cancel the + // deprecation job so it can be reset. + prefixState.deprecationJob.Cancel() now := time.Now() - // Reset the deprecation timer if prefix has a finite preferred lifetime. + // Schedule the deprecation job if prefix has a finite preferred lifetime. if pl < header.NDPInfiniteLifetime { if !deprecated { - prefixState.deprecationTimer.Reset(pl) + prefixState.deprecationJob.Schedule(pl) } prefixState.preferredUntil = now.Add(pl) } else { @@ -1546,9 +1546,9 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat // 3) Otherwise, reset the valid lifetime of the prefix to 2 hours. if vl >= header.NDPInfiniteLifetime { - // Handle the infinite valid lifetime separately as we do not keep a timer - // in this case. - prefixState.invalidationTimer.StopLocked() + // Handle the infinite valid lifetime separately as we do not schedule a + // job in this case. + prefixState.invalidationJob.Cancel() prefixState.validUntil = time.Time{} } else { var effectiveVl time.Duration @@ -1569,8 +1569,8 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat } if effectiveVl != 0 { - prefixState.invalidationTimer.StopLocked() - prefixState.invalidationTimer.Reset(effectiveVl) + prefixState.invalidationJob.Cancel() + prefixState.invalidationJob.Schedule(effectiveVl) prefixState.validUntil = now.Add(effectiveVl) } } @@ -1582,7 +1582,7 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat } // Note, we do not need to update the entries in the temporary address map - // after updating the timers because the timers are held as pointers. + // after updating the jobs because the jobs are held as pointers. var regenForAddr tcpip.Address allAddressesRegenerated := true for tempAddr, tempAddrState := range prefixState.tempAddrs { @@ -1596,14 +1596,14 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat } // If the address is no longer valid, invalidate it immediately. Otherwise, - // reset the invalidation timer. + // reset the invalidation job. newValidLifetime := validUntil.Sub(now) if newValidLifetime <= 0 { ndp.invalidateTempSLAACAddr(prefixState.tempAddrs, tempAddr, tempAddrState) continue } - tempAddrState.invalidationTimer.StopLocked() - tempAddrState.invalidationTimer.Reset(newValidLifetime) + tempAddrState.invalidationJob.Cancel() + tempAddrState.invalidationJob.Schedule(newValidLifetime) // As per RFC 4941 section 3.3 step 4, the preferred lifetime of a temporary // address is the lower of the preferred lifetime of the stable address or @@ -1616,17 +1616,17 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat } // If the address is no longer preferred, deprecate it immediately. - // Otherwise, reset the deprecation timer. + // Otherwise, schedule the deprecation job again. newPreferredLifetime := preferredUntil.Sub(now) - tempAddrState.deprecationTimer.StopLocked() + tempAddrState.deprecationJob.Cancel() if newPreferredLifetime <= 0 { ndp.deprecateSLAACAddress(tempAddrState.ref) } else { tempAddrState.ref.deprecated = false - tempAddrState.deprecationTimer.Reset(newPreferredLifetime) + tempAddrState.deprecationJob.Schedule(newPreferredLifetime) } - tempAddrState.regenTimer.StopLocked() + tempAddrState.regenJob.Cancel() if tempAddrState.regenerated { } else { allAddressesRegenerated = false @@ -1637,7 +1637,7 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat // immediately after we finish iterating over the temporary addresses. regenForAddr = tempAddr } else { - tempAddrState.regenTimer.Reset(newPreferredLifetime - ndp.configs.RegenAdvanceDuration) + tempAddrState.regenJob.Schedule(newPreferredLifetime - ndp.configs.RegenAdvanceDuration) } } } @@ -1717,7 +1717,7 @@ func (ndp *ndpState) cleanupSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPr ndp.cleanupSLAACPrefixResources(prefix, state) } -// cleanupSLAACPrefixResources cleansup a SLAAC prefix's timers and entry. +// cleanupSLAACPrefixResources cleans up a SLAAC prefix's jobs and entry. // // Panics if the SLAAC prefix is not known. // @@ -1729,8 +1729,8 @@ func (ndp *ndpState) cleanupSLAACPrefixResources(prefix tcpip.Subnet, state slaa } state.stableAddr.ref = nil - state.deprecationTimer.StopLocked() - state.invalidationTimer.StopLocked() + state.deprecationJob.Cancel() + state.invalidationJob.Cancel() delete(ndp.slaacPrefixes, prefix) } @@ -1775,13 +1775,13 @@ func (ndp *ndpState) cleanupTempSLAACAddrResourcesAndNotify(addr tcpip.AddressWi } // cleanupTempSLAACAddrResourcesAndNotify cleans up a temporary SLAAC address's -// timers and entry. +// jobs and entry. // // The NIC that ndp belongs to MUST be locked. func (ndp *ndpState) cleanupTempSLAACAddrResources(tempAddrs map[tcpip.Address]tempSLAACAddrState, tempAddr tcpip.Address, tempAddrState tempSLAACAddrState) { - tempAddrState.deprecationTimer.StopLocked() - tempAddrState.invalidationTimer.StopLocked() - tempAddrState.regenTimer.StopLocked() + tempAddrState.deprecationJob.Cancel() + tempAddrState.invalidationJob.Cancel() + tempAddrState.regenJob.Cancel() delete(tempAddrs, tempAddr) } @@ -1860,7 +1860,7 @@ func (ndp *ndpState) startSolicitingRouters() { var done bool ndp.rtrSolicit.done = &done - ndp.rtrSolicit.timer = time.AfterFunc(delay, func() { + ndp.rtrSolicit.timer = ndp.nic.stack.Clock().AfterFunc(delay, func() { ndp.nic.mu.Lock() if done { // If we reach this point, it means that the RS timer fired after another diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go index 6f86abc98..644ba7c33 100644 --- a/pkg/tcpip/stack/ndp_test.go +++ b/pkg/tcpip/stack/ndp_test.go @@ -1254,7 +1254,7 @@ func TestRouterDiscovery(t *testing.T) { default: } - // Wait for lladdr2's router invalidation timer to fire. The lifetime + // Wait for lladdr2's router invalidation job to execute. The lifetime // of the router should have been updated to the most recent (smaller) // lifetime. // @@ -1271,7 +1271,7 @@ func TestRouterDiscovery(t *testing.T) { e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 0)) expectRouterEvent(llAddr2, false) - // Wait for lladdr3's router invalidation timer to fire. The lifetime + // Wait for lladdr3's router invalidation job to execute. The lifetime // of the router should have been updated to the most recent (smaller) // lifetime. // @@ -1502,7 +1502,7 @@ func TestPrefixDiscovery(t *testing.T) { default: } - // Wait for prefix2's most recent invalidation timer plus some buffer to + // Wait for prefix2's most recent invalidation job plus some buffer to // expire. select { case e := <-ndpDisp.prefixC: @@ -2395,7 +2395,7 @@ func TestAutoGenTempAddrRegen(t *testing.T) { for _, addr := range tempAddrs { // Wait for a deprecation then invalidation event, or just an invalidation // event. We need to cover both cases but cannot deterministically hit both - // cases because the deprecation and invalidation timers could fire in any + // cases because the deprecation and invalidation jobs could execute in any // order. select { case e := <-ndpDisp.autoGenAddrC: @@ -2432,9 +2432,9 @@ func TestAutoGenTempAddrRegen(t *testing.T) { } } -// TestAutoGenTempAddrRegenTimerUpdates tests that a temporary address's -// regeneration timer gets updated when refreshing the address's lifetimes. -func TestAutoGenTempAddrRegenTimerUpdates(t *testing.T) { +// TestAutoGenTempAddrRegenJobUpdates tests that a temporary address's +// regeneration job gets updated when refreshing the address's lifetimes. +func TestAutoGenTempAddrRegenJobUpdates(t *testing.T) { const ( nicID = 1 regenAfter = 2 * time.Second @@ -2533,7 +2533,7 @@ func TestAutoGenTempAddrRegenTimerUpdates(t *testing.T) { // // A new temporary address should immediately be generated since the // regeneration time has already passed since the last address was generated - // - this regeneration does not depend on a timer. + // - this regeneration does not depend on a job. e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100)) expectAutoGenAddrEvent(tempAddr2, newAddr) @@ -2559,11 +2559,11 @@ func TestAutoGenTempAddrRegenTimerUpdates(t *testing.T) { } // Set the maximum lifetimes for temporary addresses such that on the next - // RA, the regeneration timer gets reset. + // RA, the regeneration job gets scheduled again. // // The maximum lifetime is the sum of the minimum lifetimes for temporary // addresses + the time that has already passed since the last address was - // generated so that the regeneration timer is needed to generate the next + // generated so that the regeneration job is needed to generate the next // address. newLifetimes := newMinVLDuration + regenAfter + defaultAsyncNegativeEventTimeout ndpConfigs.MaxTempAddrValidLifetime = newLifetimes @@ -2993,9 +2993,9 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) { expectPrimaryAddr(addr2) } -// TestAutoGenAddrTimerDeprecation tests that an address is properly deprecated +// TestAutoGenAddrJobDeprecation tests that an address is properly deprecated // when its preferred lifetime expires. -func TestAutoGenAddrTimerDeprecation(t *testing.T) { +func TestAutoGenAddrJobDeprecation(t *testing.T) { const nicID = 1 const newMinVL = 2 newMinVLDuration := newMinVL * time.Second @@ -3513,8 +3513,8 @@ func TestAutoGenAddrRemoval(t *testing.T) { } expectAutoGenAddrEvent(addr, invalidatedAddr) - // Wait for the original valid lifetime to make sure the original timer - // got stopped/cleaned up. + // Wait for the original valid lifetime to make sure the original job got + // cancelled/cleaned up. select { case <-ndpDisp.autoGenAddrC: t.Fatal("unexpectedly received an auto gen addr event") diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 7b80534e6..fea0ce7e8 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -1200,15 +1200,13 @@ func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcp // Are any packet sockets listening for this network protocol? packetEPs := n.mu.packetEPs[protocol] - // Check whether there are packet sockets listening for every protocol. - // If we received a packet with protocol EthernetProtocolAll, then the - // previous for loop will have handled it. - if protocol != header.EthernetProtocolAll { - packetEPs = append(packetEPs, n.mu.packetEPs[header.EthernetProtocolAll]...) - } + // Add any other packet sockets that maybe listening for all protocols. + packetEPs = append(packetEPs, n.mu.packetEPs[header.EthernetProtocolAll]...) n.mu.RUnlock() for _, ep := range packetEPs { - ep.HandlePacket(n.id, local, protocol, pkt.Clone()) + p := pkt.Clone() + p.PktType = tcpip.PacketHost + ep.HandlePacket(n.id, local, protocol, p) } if netProto.Number() == header.IPv4ProtocolNumber || netProto.Number() == header.IPv6ProtocolNumber { @@ -1311,6 +1309,24 @@ func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcp } } +// DeliverOutboundPacket implements NetworkDispatcher.DeliverOutboundPacket. +func (n *NIC) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { + n.mu.RLock() + // We do not deliver to protocol specific packet endpoints as on Linux + // only ETH_P_ALL endpoints get outbound packets. + // Add any other packet sockets that maybe listening for all protocols. + packetEPs := n.mu.packetEPs[header.EthernetProtocolAll] + n.mu.RUnlock() + for _, ep := range packetEPs { + p := pkt.Clone() + p.PktType = tcpip.PacketOutgoing + // Add the link layer header as outgoing packets are intercepted + // before the link layer header is created. + n.linkEP.AddHeader(local, remote, protocol, p) + ep.HandlePacket(n.id, local, protocol, p) + } +} + func (n *NIC) forwardPacket(r *Route, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { // TODO(b/143425874) Decrease the TTL field in forwarded packets. // TODO(b/151227689): Avoid copying the packet when forwarding. We can do this diff --git a/pkg/tcpip/stack/nic_test.go b/pkg/tcpip/stack/nic_test.go index 3bc9fd831..a70792b50 100644 --- a/pkg/tcpip/stack/nic_test.go +++ b/pkg/tcpip/stack/nic_test.go @@ -89,6 +89,11 @@ func (*testLinkEndpoint) ARPHardwareType() header.ARPHardwareType { panic("not implemented") } +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (e *testLinkEndpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { + panic("not implemented") +} + var _ NetworkEndpoint = (*testIPv6Endpoint)(nil) // An IPv6 NetworkEndpoint that throws away outgoing packets. @@ -238,7 +243,7 @@ func (*testIPv6Protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber { } // LinkAddressRequest implements LinkAddressResolver. -func (*testIPv6Protocol) LinkAddressRequest(_, _ tcpip.Address, _ LinkEndpoint) *tcpip.Error { +func (*testIPv6Protocol) LinkAddressRequest(_, _ tcpip.Address, _ tcpip.LinkAddress, _ LinkEndpoint) *tcpip.Error { return nil } diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go index e3556d5d2..5d6865e35 100644 --- a/pkg/tcpip/stack/packet_buffer.go +++ b/pkg/tcpip/stack/packet_buffer.go @@ -79,6 +79,10 @@ type PacketBuffer struct { // NatDone indicates if the packet has been manipulated as per NAT // iptables rule. NatDone bool + + // PktType indicates the SockAddrLink.PacketType of the packet as defined in + // https://www.man7.org/linux/man-pages/man7/packet.7.html. + PktType tcpip.PacketType } // Clone makes a copy of pk. It clones the Data field, which creates a new diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index f260eeb7f..8604c4259 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -52,8 +52,11 @@ type TransportEndpointID struct { type ControlType int // The following are the allowed values for ControlType values. +// TODO(http://gvisor.dev/issue/3210): Support time exceeded messages. const ( - ControlPacketTooBig ControlType = iota + ControlNetworkUnreachable ControlType = iota + ControlNoRoute + ControlPacketTooBig ControlPortUnreachable ControlUnknown ) @@ -330,8 +333,7 @@ type NetworkProtocol interface { } // NetworkDispatcher contains the methods used by the network stack to deliver -// packets to the appropriate network endpoint after it has been handled by -// the data link layer. +// inbound/outbound packets to the appropriate network/packet(if any) endpoints. type NetworkDispatcher interface { // DeliverNetworkPacket finds the appropriate network protocol endpoint // and hands the packet over for further processing. @@ -342,6 +344,16 @@ type NetworkDispatcher interface { // // DeliverNetworkPacket takes ownership of pkt. DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) + + // DeliverOutboundPacket is called by link layer when a packet is being + // sent out. + // + // pkt.LinkHeader may or may not be set before calling + // DeliverOutboundPacket. Some packets do not have link headers (e.g. + // packets sent via loopback), and won't have the field set. + // + // DeliverOutboundPacket takes ownership of pkt. + DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) } // LinkEndpointCapabilities is the type associated with the capabilities @@ -443,6 +455,9 @@ type LinkEndpoint interface { // See: // https://github.com/torvalds/linux/blob/aa0c9086b40c17a7ad94425b3b70dd1fdd7497bf/include/uapi/linux/if_arp.h#L30 ARPHardwareType() header.ARPHardwareType + + // AddHeader adds a link layer header to pkt if required. + AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) } // InjectableLinkEndpoint is a LinkEndpoint where inbound packets are @@ -463,12 +478,13 @@ type InjectableLinkEndpoint interface { // A LinkAddressResolver is an extension to a NetworkProtocol that // can resolve link addresses. type LinkAddressResolver interface { - // LinkAddressRequest sends a request for the LinkAddress of addr. - // The request is sent on linkEP with localAddr as the source. + // LinkAddressRequest sends a request for the LinkAddress of addr. Broadcasts + // the request on the local network if remoteLinkAddr is the zero value. The + // request is sent on linkEP with localAddr as the source. // // A valid response will cause the discovery protocol's network // endpoint to call AddLinkAddress. - LinkAddressRequest(addr, localAddr tcpip.Address, linkEP LinkEndpoint) *tcpip.Error + LinkAddressRequest(addr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, linkEP LinkEndpoint) *tcpip.Error // ResolveStaticAddress attempts to resolve address without sending // requests. It either resolves the name immediately or returns the diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 2b7ece851..a6faa22c2 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -728,6 +728,11 @@ func New(opts Options) *Stack { return s } +// newJob returns a tcpip.Job using the Stack clock. +func (s *Stack) newJob(l sync.Locker, f func()) *tcpip.Job { + return tcpip.NewJob(s.clock, l, f) +} + // UniqueID returns a unique identifier. func (s *Stack) UniqueID() uint64 { return s.uniqueIDGenerator.UniqueID() @@ -801,9 +806,10 @@ func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h f } } -// NowNanoseconds implements tcpip.Clock.NowNanoseconds. -func (s *Stack) NowNanoseconds() int64 { - return s.clock.NowNanoseconds() +// Clock returns the Stack's clock for retrieving the current time and +// scheduling work. +func (s *Stack) Clock() tcpip.Clock { + return s.clock } // Stats returns a mutable copy of the current stats. diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 48ad56d4d..21aafb0a2 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -192,7 +192,7 @@ func (e ErrSaveRejection) Error() string { return "save rejected due to unsupported networking state: " + e.Err.Error() } -// A Clock provides the current time. +// A Clock provides the current time and schedules work for execution. // // Times returned by a Clock should always be used for application-visible // time. Only monotonic times should be used for netstack internal timekeeping. @@ -203,6 +203,31 @@ type Clock interface { // NowMonotonic returns a monotonic time value. NowMonotonic() int64 + + // AfterFunc waits for the duration to elapse and then calls f in its own + // goroutine. It returns a Timer that can be used to cancel the call using + // its Stop method. + AfterFunc(d time.Duration, f func()) Timer +} + +// Timer represents a single event. A Timer must be created with +// Clock.AfterFunc. +type Timer interface { + // Stop prevents the Timer from firing. It returns true if the call stops the + // timer, false if the timer has already expired or been stopped. + // + // If Stop returns false, then the timer has already expired and the function + // f of Clock.AfterFunc(d, f) has been started in its own goroutine; Stop + // does not wait for f to complete before returning. If the caller needs to + // know whether f is completed, it must coordinate with f explicitly. + Stop() bool + + // Reset changes the timer to expire after duration d. + // + // Reset should be invoked only on stopped or expired timers. If the timer is + // known to have expired, Reset can be used directly. Otherwise, the caller + // must coordinate with the function f of Clock.AfterFunc(d, f). + Reset(d time.Duration) } // Address is a byte slice cast as a string that represents the address of a @@ -316,6 +341,28 @@ const ( ShutdownWrite ) +// PacketType is used to indicate the destination of the packet. +type PacketType uint8 + +const ( + // PacketHost indicates a packet addressed to the local host. + PacketHost PacketType = iota + + // PacketOtherHost indicates an outgoing packet addressed to + // another host caught by a NIC in promiscuous mode. + PacketOtherHost + + // PacketOutgoing for a packet originating from the local host + // that is looped back to a packet socket. + PacketOutgoing + + // PacketBroadcast indicates a link layer broadcast packet. + PacketBroadcast + + // PacketMulticast indicates a link layer multicast packet. + PacketMulticast +) + // FullAddress represents a full transport node address, as required by the // Connect() and Bind() methods. // @@ -555,6 +602,9 @@ type Endpoint interface { type LinkPacketInfo struct { // Protocol is the NetworkProtocolNumber for the packet. Protocol NetworkProtocolNumber + + // PktType is used to indicate the destination of the packet. + PktType PacketType } // PacketEndpoint are additional methods that are only implemented by Packet diff --git a/pkg/tcpip/time_unsafe.go b/pkg/tcpip/time_unsafe.go index 7f172f978..f32d58091 100644 --- a/pkg/tcpip/time_unsafe.go +++ b/pkg/tcpip/time_unsafe.go @@ -20,7 +20,7 @@ package tcpip import ( - _ "time" // Used with go:linkname. + "time" // Used with go:linkname. _ "unsafe" // Required for go:linkname. ) @@ -45,3 +45,31 @@ func (*StdClock) NowMonotonic() int64 { _, _, mono := now() return mono } + +// AfterFunc implements Clock.AfterFunc. +func (*StdClock) AfterFunc(d time.Duration, f func()) Timer { + return &stdTimer{ + t: time.AfterFunc(d, f), + } +} + +type stdTimer struct { + t *time.Timer +} + +var _ Timer = (*stdTimer)(nil) + +// Stop implements Timer.Stop. +func (st *stdTimer) Stop() bool { + return st.t.Stop() +} + +// Reset implements Timer.Reset. +func (st *stdTimer) Reset(d time.Duration) { + st.t.Reset(d) +} + +// NewStdTimer returns a Timer implemented with the time package. +func NewStdTimer(t *time.Timer) Timer { + return &stdTimer{t: t} +} diff --git a/pkg/tcpip/timer.go b/pkg/tcpip/timer.go index 5554c573f..f1dd7c310 100644 --- a/pkg/tcpip/timer.go +++ b/pkg/tcpip/timer.go @@ -20,50 +20,49 @@ import ( "gvisor.dev/gvisor/pkg/sync" ) -// cancellableTimerInstance is a specific instance of CancellableTimer. +// jobInstance is a specific instance of Job. // -// Different instances are created each time CancellableTimer is Reset so each -// timer has its own earlyReturn signal. This is to address a bug when a -// CancellableTimer is stopped and reset in quick succession resulting in a -// timer instance's earlyReturn signal being affected or seen by another timer -// instance. +// Different instances are created each time Job is scheduled so each timer has +// its own earlyReturn signal. This is to address a bug when a Job is stopped +// and reset in quick succession resulting in a timer instance's earlyReturn +// signal being affected or seen by another timer instance. // // Consider the following sceneario where timer instances share a common // earlyReturn signal (T1 creates, stops and resets a Cancellable timer under a // lock L; T2, T3, T4 and T5 are goroutines that handle the first (A), second // (B), third (C), and fourth (D) instance of the timer firing, respectively): // T1: Obtain L -// T1: Create a new CancellableTimer w/ lock L (create instance A) +// T1: Create a new Job w/ lock L (create instance A) // T2: instance A fires, blocked trying to obtain L. // T1: Attempt to stop instance A (set earlyReturn = true) -// T1: Reset timer (create instance B) +// T1: Schedule timer (create instance B) // T3: instance B fires, blocked trying to obtain L. // T1: Attempt to stop instance B (set earlyReturn = true) -// T1: Reset timer (create instance C) +// T1: Schedule timer (create instance C) // T4: instance C fires, blocked trying to obtain L. // T1: Attempt to stop instance C (set earlyReturn = true) -// T1: Reset timer (create instance D) +// T1: Schedule timer (create instance D) // T5: instance D fires, blocked trying to obtain L. // T1: Release L // -// Now that T1 has released L, any of the 4 timer instances can take L and check -// earlyReturn. If the timers simply check earlyReturn and then do nothing -// further, then instance D will never early return even though it was not -// requested to stop. If the timers reset earlyReturn before early returning, -// then all but one of the timers will do work when only one was expected to. -// If CancellableTimer resets earlyReturn when resetting, then all the timers +// Now that T1 has released L, any of the 4 timer instances can take L and +// check earlyReturn. If the timers simply check earlyReturn and then do +// nothing further, then instance D will never early return even though it was +// not requested to stop. If the timers reset earlyReturn before early +// returning, then all but one of the timers will do work when only one was +// expected to. If Job resets earlyReturn when resetting, then all the timers // will fire (again, when only one was expected to). // // To address the above concerns the simplest solution was to give each timer // its own earlyReturn signal. -type cancellableTimerInstance struct { - timer *time.Timer +type jobInstance struct { + timer Timer // Used to inform the timer to early return when it gets stopped while the // lock the timer tries to obtain when fired is held (T1 is a goroutine that // tries to cancel the timer and T2 is the goroutine that handles the timer // firing): - // T1: Obtain the lock, then call StopLocked() + // T1: Obtain the lock, then call Cancel() // T2: timer fires, and gets blocked on obtaining the lock // T1: Releases lock // T2: Obtains lock does unintended work @@ -74,29 +73,33 @@ type cancellableTimerInstance struct { earlyReturn *bool } -// stop stops the timer instance t from firing if it hasn't fired already. If it +// stop stops the job instance j from firing if it hasn't fired already. If it // has fired and is blocked at obtaining the lock, earlyReturn will be set to // true so that it will early return when it obtains the lock. -func (t *cancellableTimerInstance) stop() { - if t.timer != nil { - t.timer.Stop() - *t.earlyReturn = true +func (j *jobInstance) stop() { + if j.timer != nil { + j.timer.Stop() + *j.earlyReturn = true } } -// CancellableTimer is a timer that does some work and can be safely cancelled -// when it fires at the same time some "related work" is being done. +// Job represents some work that can be scheduled for execution. The work can +// be safely cancelled when it fires at the same time some "related work" is +// being done. // // The term "related work" is defined as some work that needs to be done while // holding some lock that the timer must also hold while doing some work. // -// Note, it is not safe to copy a CancellableTimer as its timer instance creates -// a closure over the address of the CancellableTimer. -type CancellableTimer struct { +// Note, it is not safe to copy a Job as its timer instance creates +// a closure over the address of the Job. +type Job struct { _ sync.NoCopy + // The clock used to schedule the backing timer + clock Clock + // The active instance of a cancellable timer. - instance cancellableTimerInstance + instance jobInstance // locker is the lock taken by the timer immediately after it fires and must // be held when attempting to stop the timer. @@ -113,59 +116,91 @@ type CancellableTimer struct { fn func() } -// StopLocked prevents the Timer from firing if it has not fired already. +// Cancel prevents the Job from executing if it has not executed already. // -// If the timer is blocked on obtaining the t.locker lock when StopLocked is -// called, it will early return instead of calling t.fn. +// Cancel requires appropriate locking to be in place for any resources managed +// by the Job. If the Job is blocked on obtaining the lock when Cancel is +// called, it will early return. // // Note, t will be modified. // -// t.locker MUST be locked. -func (t *CancellableTimer) StopLocked() { - t.instance.stop() +// j.locker MUST be locked. +func (j *Job) Cancel() { + j.instance.stop() // Nothing to do with the stopped instance anymore. - t.instance = cancellableTimerInstance{} + j.instance = jobInstance{} } -// Reset changes the timer to expire after duration d. +// Schedule schedules the Job for execution after duration d. This can be +// called on cancelled or completed Jobs to schedule them again. // -// Note, t will be modified. +// Schedule should be invoked only on unscheduled, cancelled, or completed +// Jobs. To be safe, callers should always call Cancel before calling Schedule. // -// Reset should only be called on stopped or expired timers. To be safe, callers -// should always call StopLocked before calling Reset. -func (t *CancellableTimer) Reset(d time.Duration) { +// Note, j will be modified. +func (j *Job) Schedule(d time.Duration) { // Create a new instance. earlyReturn := false // Capture the locker so that updating the timer does not cause a data race // when a timer fires and tries to obtain the lock (read the timer's locker). - locker := t.locker - t.instance = cancellableTimerInstance{ - timer: time.AfterFunc(d, func() { + locker := j.locker + j.instance = jobInstance{ + timer: j.clock.AfterFunc(d, func() { locker.Lock() defer locker.Unlock() if earlyReturn { // If we reach this point, it means that the timer fired while another - // goroutine called StopLocked while it had the lock. Simply return - // here and do nothing further. + // goroutine called Cancel while it had the lock. Simply return here + // and do nothing further. earlyReturn = false return } - t.fn() + j.fn() }), earlyReturn: &earlyReturn, } } -// NewCancellableTimer returns an unscheduled CancellableTimer with the given -// locker and fn. -// -// fn MUST NOT attempt to lock locker. -// -// Callers must call Reset to schedule the timer to fire. -func NewCancellableTimer(locker sync.Locker, fn func()) *CancellableTimer { - return &CancellableTimer{locker: locker, fn: fn} +// NewJob returns a new Job that can be used to schedule f to run in its own +// gorountine. l will be locked before calling f then unlocked after f returns. +// +// var clock tcpip.StdClock +// var mu sync.Mutex +// message := "foo" +// job := tcpip.NewJob(&clock, &mu, func() { +// fmt.Println(message) +// }) +// job.Schedule(time.Second) +// +// mu.Lock() +// message = "bar" +// mu.Unlock() +// +// // Output: bar +// +// f MUST NOT attempt to lock l. +// +// l MUST be locked prior to calling the returned job's Cancel(). +// +// var clock tcpip.StdClock +// var mu sync.Mutex +// message := "foo" +// job := tcpip.NewJob(&clock, &mu, func() { +// fmt.Println(message) +// }) +// job.Schedule(time.Second) +// +// mu.Lock() +// job.Cancel() +// mu.Unlock() +func NewJob(c Clock, l sync.Locker, f func()) *Job { + return &Job{ + clock: c, + locker: l, + fn: f, + } } diff --git a/pkg/tcpip/timer_test.go b/pkg/tcpip/timer_test.go index b4940e397..a82384c49 100644 --- a/pkg/tcpip/timer_test.go +++ b/pkg/tcpip/timer_test.go @@ -28,8 +28,8 @@ const ( longDuration = 1 * time.Second ) -func TestCancellableTimerReassignment(t *testing.T) { - var timer tcpip.CancellableTimer +func TestJobReschedule(t *testing.T) { + var clock tcpip.StdClock var wg sync.WaitGroup var lock sync.Mutex @@ -43,26 +43,27 @@ func TestCancellableTimerReassignment(t *testing.T) { // that has an active timer (even if it has been stopped as a stopped // timer may be blocked on a lock before it can check if it has been // stopped while another goroutine holds the same lock). - timer = *tcpip.NewCancellableTimer(&lock, func() { + job := tcpip.NewJob(&clock, &lock, func() { wg.Done() }) - timer.Reset(shortDuration) + job.Schedule(shortDuration) lock.Unlock() }() } wg.Wait() } -func TestCancellableTimerFire(t *testing.T) { +func TestJobExecution(t *testing.T) { t.Parallel() - ch := make(chan struct{}) + var clock tcpip.StdClock var lock sync.Mutex + ch := make(chan struct{}) - timer := tcpip.NewCancellableTimer(&lock, func() { + job := tcpip.NewJob(&clock, &lock, func() { ch <- struct{}{} }) - timer.Reset(shortDuration) + job.Schedule(shortDuration) // Wait for timer to fire. select { @@ -82,17 +83,18 @@ func TestCancellableTimerFire(t *testing.T) { func TestCancellableTimerResetFromLongDuration(t *testing.T) { t.Parallel() - ch := make(chan struct{}) + var clock tcpip.StdClock var lock sync.Mutex + ch := make(chan struct{}) - timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} }) - timer.Reset(middleDuration) + job := tcpip.NewJob(&clock, &lock, func() { ch <- struct{}{} }) + job.Schedule(middleDuration) lock.Lock() - timer.StopLocked() + job.Cancel() lock.Unlock() - timer.Reset(shortDuration) + job.Schedule(shortDuration) // Wait for timer to fire. select { @@ -109,16 +111,17 @@ func TestCancellableTimerResetFromLongDuration(t *testing.T) { } } -func TestCancellableTimerResetFromShortDuration(t *testing.T) { +func TestJobRescheduleFromShortDuration(t *testing.T) { t.Parallel() - ch := make(chan struct{}) + var clock tcpip.StdClock var lock sync.Mutex + ch := make(chan struct{}) lock.Lock() - timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} }) - timer.Reset(shortDuration) - timer.StopLocked() + job := tcpip.NewJob(&clock, &lock, func() { ch <- struct{}{} }) + job.Schedule(shortDuration) + job.Cancel() lock.Unlock() // Wait for timer to fire if it wasn't correctly stopped. @@ -128,7 +131,7 @@ func TestCancellableTimerResetFromShortDuration(t *testing.T) { case <-time.After(middleDuration): } - timer.Reset(shortDuration) + job.Schedule(shortDuration) // Wait for timer to fire. select { @@ -145,17 +148,18 @@ func TestCancellableTimerResetFromShortDuration(t *testing.T) { } } -func TestCancellableTimerImmediatelyStop(t *testing.T) { +func TestJobImmediatelyCancel(t *testing.T) { t.Parallel() - ch := make(chan struct{}) + var clock tcpip.StdClock var lock sync.Mutex + ch := make(chan struct{}) for i := 0; i < 1000; i++ { lock.Lock() - timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} }) - timer.Reset(shortDuration) - timer.StopLocked() + job := tcpip.NewJob(&clock, &lock, func() { ch <- struct{}{} }) + job.Schedule(shortDuration) + job.Cancel() lock.Unlock() } @@ -167,25 +171,26 @@ func TestCancellableTimerImmediatelyStop(t *testing.T) { } } -func TestCancellableTimerStoppedResetWithoutLock(t *testing.T) { +func TestJobCancelledRescheduleWithoutLock(t *testing.T) { t.Parallel() - ch := make(chan struct{}) + var clock tcpip.StdClock var lock sync.Mutex + ch := make(chan struct{}) lock.Lock() - timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} }) - timer.Reset(shortDuration) - timer.StopLocked() + job := tcpip.NewJob(&clock, &lock, func() { ch <- struct{}{} }) + job.Schedule(shortDuration) + job.Cancel() lock.Unlock() for i := 0; i < 10; i++ { - timer.Reset(middleDuration) + job.Schedule(middleDuration) lock.Lock() // Sleep until the timer fires and gets blocked trying to take the lock. time.Sleep(middleDuration * 2) - timer.StopLocked() + job.Cancel() lock.Unlock() } @@ -201,17 +206,18 @@ func TestCancellableTimerStoppedResetWithoutLock(t *testing.T) { func TestManyCancellableTimerResetAfterBlockedOnLock(t *testing.T) { t.Parallel() - ch := make(chan struct{}) + var clock tcpip.StdClock var lock sync.Mutex + ch := make(chan struct{}) lock.Lock() - timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} }) - timer.Reset(shortDuration) + job := tcpip.NewJob(&clock, &lock, func() { ch <- struct{}{} }) + job.Schedule(shortDuration) for i := 0; i < 10; i++ { // Sleep until the timer fires and gets blocked trying to take the lock. time.Sleep(middleDuration) - timer.StopLocked() - timer.Reset(shortDuration) + job.Cancel() + job.Schedule(shortDuration) } lock.Unlock() @@ -230,18 +236,19 @@ func TestManyCancellableTimerResetAfterBlockedOnLock(t *testing.T) { } } -func TestManyCancellableTimerResetUnderLock(t *testing.T) { +func TestManyJobReschedulesUnderLock(t *testing.T) { t.Parallel() - ch := make(chan struct{}) + var clock tcpip.StdClock var lock sync.Mutex + ch := make(chan struct{}) lock.Lock() - timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} }) - timer.Reset(shortDuration) + job := tcpip.NewJob(&clock, &lock, func() { ch <- struct{}{} }) + job.Schedule(shortDuration) for i := 0; i < 10; i++ { - timer.StopLocked() - timer.Reset(shortDuration) + job.Cancel() + job.Schedule(shortDuration) } lock.Unlock() diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index 678f4e016..4612be4e7 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -797,7 +797,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk e.rcvList.PushBack(packet) e.rcvBufSize += packet.data.Size() - packet.timestamp = e.stack.NowNanoseconds() + packet.timestamp = e.stack.Clock().NowNanoseconds() e.rcvMu.Unlock() e.stats.PacketsReceived.Increment() diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go index 7b2083a09..0e46e6355 100644 --- a/pkg/tcpip/transport/packet/endpoint.go +++ b/pkg/tcpip/transport/packet/endpoint.go @@ -441,6 +441,7 @@ func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, Addr: tcpip.Address(hdr.SourceAddress()), } packet.packetInfo.Protocol = netProto + packet.packetInfo.PktType = pkt.PktType } else { // Guess the would-be ethernet header. packet.senderAddr = tcpip.FullAddress{ @@ -448,34 +449,57 @@ func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, Addr: tcpip.Address(localAddr), } packet.packetInfo.Protocol = netProto + packet.packetInfo.PktType = pkt.PktType } if ep.cooked { // Cooked packets can simply be queued. - packet.data = pkt.Data + switch pkt.PktType { + case tcpip.PacketHost: + packet.data = pkt.Data + case tcpip.PacketOutgoing: + // Strip Link Header from the Header. + pkt.Header = buffer.NewPrependableFromView(pkt.Header.View()[len(pkt.LinkHeader):]) + combinedVV := pkt.Header.View().ToVectorisedView() + combinedVV.Append(pkt.Data) + packet.data = combinedVV + default: + panic(fmt.Sprintf("unexpected PktType in pkt: %+v", pkt)) + } + } else { // Raw packets need their ethernet headers prepended before // queueing. var linkHeader buffer.View - if len(pkt.LinkHeader) == 0 { - // We weren't provided with an actual ethernet header, - // so fake one. - ethFields := header.EthernetFields{ - SrcAddr: tcpip.LinkAddress([]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00}), - DstAddr: localAddr, - Type: netProto, + var combinedVV buffer.VectorisedView + if pkt.PktType != tcpip.PacketOutgoing { + if len(pkt.LinkHeader) == 0 { + // We weren't provided with an actual ethernet header, + // so fake one. + ethFields := header.EthernetFields{ + SrcAddr: tcpip.LinkAddress([]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00}), + DstAddr: localAddr, + Type: netProto, + } + fakeHeader := make(header.Ethernet, header.EthernetMinimumSize) + fakeHeader.Encode(ðFields) + linkHeader = buffer.View(fakeHeader) + } else { + linkHeader = append(buffer.View(nil), pkt.LinkHeader...) } - fakeHeader := make(header.Ethernet, header.EthernetMinimumSize) - fakeHeader.Encode(ðFields) - linkHeader = buffer.View(fakeHeader) - } else { - linkHeader = append(buffer.View(nil), pkt.LinkHeader...) + combinedVV = linkHeader.ToVectorisedView() + } + if pkt.PktType == tcpip.PacketOutgoing { + // For outgoing packets the Link, Network and Transport + // headers are in the pkt.Header fields normally unless + // a Raw socket is in use in which case pkt.Header could + // be nil. + combinedVV.AppendView(pkt.Header.View()) } - combinedVV := linkHeader.ToVectorisedView() combinedVV.Append(pkt.Data) packet.data = combinedVV } - packet.timestampNS = ep.stack.NowNanoseconds() + packet.timestampNS = ep.stack.Clock().NowNanoseconds() ep.rcvList.PushBack(&packet) ep.rcvBufSize += packet.data.Size() diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go index c2e9fd29f..f85a68554 100644 --- a/pkg/tcpip/transport/raw/endpoint.go +++ b/pkg/tcpip/transport/raw/endpoint.go @@ -456,7 +456,7 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error { defer e.mu.Unlock() // If a local address was specified, verify that it's valid. - if e.stack.CheckLocalAddress(addr.NIC, e.NetProto, addr.Addr) == 0 { + if len(addr.Addr) != 0 && e.stack.CheckLocalAddress(addr.NIC, e.NetProto, addr.Addr) == 0 { return tcpip.ErrBadLocalAddress } @@ -700,7 +700,7 @@ func (e *endpoint) HandlePacket(route *stack.Route, pkt *stack.PacketBuffer) { } combinedVV.Append(pkt.Data) packet.data = combinedVV - packet.timestampNS = e.stack.NowNanoseconds() + packet.timestampNS = e.stack.Clock().NowNanoseconds() e.rcvList.PushBack(packet) e.rcvBufSize += packet.data.Size() diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index 18ff89ffc..e860ee484 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -49,6 +49,7 @@ go_library( "segment_heap.go", "segment_queue.go", "segment_state.go", + "segment_unsafe.go", "snd.go", "snd_state.go", "tcp_endpoint_list.go", diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 81b740115..1798510bc 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -490,6 +490,9 @@ func (h *handshake) resolveRoute() *tcpip.Error { <-h.ep.undrain h.ep.mu.Lock() } + if n¬ifyError != 0 { + return h.ep.takeLastError() + } } // Wait for notification. @@ -616,6 +619,9 @@ func (h *handshake) execute() *tcpip.Error { <-h.ep.undrain h.ep.mu.Lock() } + if n¬ifyError != 0 { + return h.ep.takeLastError() + } case wakerForNewSegment: if err := h.processSegments(); err != nil { diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 83dc10ed0..0f7487963 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1209,6 +1209,14 @@ func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { e.owner = owner } +func (e *endpoint) takeLastError() *tcpip.Error { + e.lastErrorMu.Lock() + defer e.lastErrorMu.Unlock() + err := e.lastError + e.lastError = nil + return err +} + // Read reads data from the endpoint. func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { e.LockUser() @@ -1956,11 +1964,7 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { switch o := opt.(type) { case tcpip.ErrorOption: - e.lastErrorMu.Lock() - err := e.lastError - e.lastError = nil - e.lastErrorMu.Unlock() - return err + return e.takeLastError() case *tcpip.BindToDeviceOption: e.LockUser() @@ -2546,6 +2550,18 @@ func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.C e.sndBufMu.Unlock() e.notifyProtocolGoroutine(notifyMTUChanged) + + case stack.ControlNoRoute: + e.lastErrorMu.Lock() + e.lastError = tcpip.ErrNoRoute + e.lastErrorMu.Unlock() + e.notifyProtocolGoroutine(notifyError) + + case stack.ControlNetworkUnreachable: + e.lastErrorMu.Lock() + e.lastError = tcpip.ErrNetworkUnreachable + e.lastErrorMu.Unlock() + e.notifyProtocolGoroutine(notifyError) } } diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index b34e47bbd..5d6174a59 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -49,7 +49,7 @@ const ( // DefaultReceiveBufferSize is the default size of the receive buffer // for an endpoint. - DefaultReceiveBufferSize = 1 << 20 // 1MB + DefaultReceiveBufferSize = 32 << 10 // 32KB // MaxBufferSize is the largest size a receive/send buffer can grow to. MaxBufferSize = 4 << 20 // 4MB diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go index dd89a292a..5e0bfe585 100644 --- a/pkg/tcpip/transport/tcp/rcv.go +++ b/pkg/tcpip/transport/tcp/rcv.go @@ -372,7 +372,7 @@ func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) { // We only store the segment if it's within our buffer // size limit. if r.pendingBufUsed < r.pendingBufSize { - r.pendingBufUsed += s.logicalLen() + r.pendingBufUsed += seqnum.Size(s.segMemSize()) s.incRef() heap.Push(&r.pendingRcvdSegments, s) UpdateSACKBlocks(&r.ep.sack, segSeq, segSeq.Add(segLen), r.rcvNxt) @@ -406,7 +406,7 @@ func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) { } heap.Pop(&r.pendingRcvdSegments) - r.pendingBufUsed -= s.logicalLen() + r.pendingBufUsed -= seqnum.Size(s.segMemSize()) s.decRef() } return false, nil diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go index 0280892a8..bb60dc29d 100644 --- a/pkg/tcpip/transport/tcp/segment.go +++ b/pkg/tcpip/transport/tcp/segment.go @@ -138,6 +138,12 @@ func (s *segment) logicalLen() seqnum.Size { return l } +// segMemSize is the amount of memory used to hold the segment data and +// the associated metadata. +func (s *segment) segMemSize() int { + return segSize + s.data.Size() +} + // parse populates the sequence & ack numbers, flags, and window fields of the // segment from the TCP header stored in the data. It then updates the view to // skip the header. diff --git a/pkg/tcpip/transport/tcp/segment_unsafe.go b/pkg/tcpip/transport/tcp/segment_unsafe.go new file mode 100644 index 000000000..0ab7b8f56 --- /dev/null +++ b/pkg/tcpip/transport/tcp/segment_unsafe.go @@ -0,0 +1,23 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp + +import ( + "unsafe" +) + +const ( + segSize = int(unsafe.Sizeof(segment{})) +) diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go index 06fde2a79..37e7767d6 100644 --- a/pkg/tcpip/transport/tcp/testing/context/context.go +++ b/pkg/tcpip/transport/tcp/testing/context/context.go @@ -143,12 +143,14 @@ func New(t *testing.T, mtu uint32) *Context { TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()}, }) + const sendBufferSize = 1 << 20 // 1 MiB + const recvBufferSize = 1 << 20 // 1 MiB // Allow minimum send/receive buffer sizes to be 1 during tests. - if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SendBufferSizeOption{Min: 1, Default: tcp.DefaultSendBufferSize, Max: 10 * tcp.DefaultSendBufferSize}); err != nil { + if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SendBufferSizeOption{Min: 1, Default: sendBufferSize, Max: 10 * sendBufferSize}); err != nil { t.Fatalf("SetTransportProtocolOption failed: %s", err) } - if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{Min: 1, Default: tcp.DefaultReceiveBufferSize, Max: 10 * tcp.DefaultReceiveBufferSize}); err != nil { + if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{Min: 1, Default: recvBufferSize, Max: 10 * recvBufferSize}); err != nil { t.Fatalf("SetTransportProtocolOption failed: %s", err) } @@ -202,7 +204,7 @@ func New(t *testing.T, mtu uint32) *Context { t: t, s: s, linkEP: ep, - WindowScale: uint8(tcp.FindWndScale(tcp.DefaultReceiveBufferSize)), + WindowScale: uint8(tcp.FindWndScale(recvBufferSize)), } } diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index a14643ae8..6e692da07 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -1451,7 +1451,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk packet.tos, _ = header.IPv6(pkt.NetworkHeader).TOS() } - packet.timestamp = e.stack.NowNanoseconds() + packet.timestamp = e.stack.Clock().NowNanoseconds() e.rcvMu.Unlock() diff --git a/pkg/test/criutil/criutil.go b/pkg/test/criutil/criutil.go index 66f10c016..70945f234 100644 --- a/pkg/test/criutil/criutil.go +++ b/pkg/test/criutil/criutil.go @@ -40,9 +40,9 @@ type Crictl struct { cleanup []func() } -// resolvePath attempts to find binary paths. It may set the path to invalid, +// ResolvePath attempts to find binary paths. It may set the path to invalid, // which will cause the execution to fail with a sensible error. -func resolvePath(executable string) string { +func ResolvePath(executable string) string { runtime, err := dockerutil.RuntimePath() if err == nil { // Check first the directory of the runtime itself. @@ -230,7 +230,7 @@ func (cc *Crictl) Import(image string) error { // be pushing a lot of bytes in order to import the image. The connect // timeout stays the same and is inherited from the Crictl instance. cmd := testutil.Command(cc.logger, - resolvePath("ctr"), + ResolvePath("ctr"), fmt.Sprintf("--connect-timeout=%s", 30*time.Second), fmt.Sprintf("--address=%s", cc.endpoint), "-n", "k8s.io", "images", "import", "-") @@ -358,7 +358,7 @@ func (cc *Crictl) StopPodAndContainer(podID, contID string) error { // run runs crictl with the given args. func (cc *Crictl) run(args ...string) (string, error) { defaultArgs := []string{ - resolvePath("crictl"), + ResolvePath("crictl"), "--image-endpoint", fmt.Sprintf("unix://%s", cc.endpoint), "--runtime-endpoint", fmt.Sprintf("unix://%s", cc.endpoint), } diff --git a/pkg/test/dockerutil/BUILD b/pkg/test/dockerutil/BUILD index 83b80c8bc..a5e84658a 100644 --- a/pkg/test/dockerutil/BUILD +++ b/pkg/test/dockerutil/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -10,6 +10,7 @@ go_library( "dockerutil.go", "exec.go", "network.go", + "profile.go", ], visibility = ["//:sandbox"], deps = [ @@ -23,3 +24,19 @@ go_library( "@com_github_docker_go_connections//nat:go_default_library", ], ) + +go_test( + name = "profile_test", + size = "large", + srcs = [ + "profile_test.go", + ], + library = ":dockerutil", + tags = [ + # Requires docker and runsc to be configured before test runs. + # Also requires the test to be run as root. + "manual", + "local", + ], + visibility = ["//:sandbox"], +) diff --git a/pkg/test/dockerutil/README.md b/pkg/test/dockerutil/README.md new file mode 100644 index 000000000..870292096 --- /dev/null +++ b/pkg/test/dockerutil/README.md @@ -0,0 +1,86 @@ +# dockerutil + +This package is for creating and controlling docker containers for testing +runsc, gVisor's docker/kubernetes binary. A simple test may look like: + +``` + func TestSuperCool(t *testing.T) { + ctx := context.Background() + c := dockerutil.MakeContainer(ctx, t) + got, err := c.Run(ctx, dockerutil.RunOpts{ + Image: "basic/alpine" + }, "echo", "super cool") + if err != nil { + t.Fatalf("err was not nil: %v", err) + } + want := "super cool" + if !strings.Contains(got, want){ + t.Fatalf("want: %s, got: %s", want, got) + } + } +``` + +For further examples, see many of our end to end tests elsewhere in the repo, +such as those in //test/e2e or benchmarks at //test/benchmarks. + +dockerutil uses the "official" docker golang api, which is +[very powerful](https://godoc.org/github.com/docker/docker/client). dockerutil +is a thin wrapper around this API, allowing desired new use cases to be easily +implemented. + +## Profiling + +dockerutil is capable of generating profiles. Currently, the only option is to +use pprof profiles generated by `runsc debug`. The profiler will generate Block, +CPU, Heap, Goroutine, and Mutex profiles. To generate profiles: + +* Install runsc with the `--profile` flag: `make configure RUNTIME=myrunsc + ARGS="--profile"` Also add other flags with ARGS like `--platform=kvm` or + `--vfs2`. +* Restart docker: `sudo service docker restart` + +To run and generate CPU profiles run: + +``` +make sudo TARGETS=//path/to:target \ + ARGS="--runtime=myrunsc -test.v -test.bench=. --pprof-cpu" OPTIONS="-c opt" +``` + +Profiles would be at: `/tmp/profile/myrunsc/CONTAINERNAME/cpu.pprof` + +Container name in most tests and benchmarks in gVisor is usually the test name +and some random characters like so: +`BenchmarkABSL-CleanCache-JF2J2ZYF3U7SL47QAA727CSJI3C4ZAW2` + +Profiling requires root as runsc debug inspects running containers in /var/run +among other things. + +### Writing for Profiling + +The below shows an example of using profiles with dockerutil. + +``` +func TestSuperCool(t *testing.T){ + ctx := context.Background() + // profiled and using runtime from dockerutil.runtime flag + profiled := MakeContainer() + + // not profiled and using runtime runc + native := MakeNativeContainer() + + err := profiled.Spawn(ctx, RunOpts{ + Image: "some/image", + }, "sleep", "100000") + // profiling has begun here + ... + expensive setup that I don't want to profile. + ... + profiled.RestartProfiles() + // profiled activity +} +``` + +In the above example, `profiled` would be profiled and `native` would not. The +call to `RestartProfiles()` restarts the clock on profiling. This is useful if +the main activity being tested is done with `docker exec` or `container.Spawn()` +followed by one or more `container.Exec()` calls. diff --git a/pkg/test/dockerutil/container.go b/pkg/test/dockerutil/container.go index 17acdaf6f..b59503188 100644 --- a/pkg/test/dockerutil/container.go +++ b/pkg/test/dockerutil/container.go @@ -43,15 +43,21 @@ import ( // See: https://pkg.go.dev/github.com/docker/docker. type Container struct { Name string - Runtime string + runtime string logger testutil.Logger client *client.Client id string mounts []mount.Mount links []string - cleanups []func() copyErr error + cleanups []func() + + // Profiles are profiles added to this container. They contain methods + // that are run after Creation, Start, and Cleanup of this Container, along + // a handle to restart the profile. Generally, tests/benchmarks using + // profiles need to run as root. + profiles []Profile // Stores streams attached to the container. Used by WaitForOutputSubmatch. streams types.HijackedResponse @@ -106,7 +112,19 @@ type RunOpts struct { // MakeContainer sets up the struct for a Docker container. // // Names of containers will be unique. +// Containers will check flags for profiling requests. func MakeContainer(ctx context.Context, logger testutil.Logger) *Container { + c := MakeNativeContainer(ctx, logger) + c.runtime = *runtime + if p := MakePprofFromFlags(c); p != nil { + c.AddProfile(p) + } + return c +} + +// MakeNativeContainer sets up the struct for a DockerContainer using runc. Native +// containers aren't profiled. +func MakeNativeContainer(ctx context.Context, logger testutil.Logger) *Container { // Slashes are not allowed in container names. name := testutil.RandomID(logger.Name()) name = strings.ReplaceAll(name, "/", "-") @@ -114,20 +132,33 @@ func MakeContainer(ctx context.Context, logger testutil.Logger) *Container { if err != nil { return nil } - client.NegotiateAPIVersion(ctx) - return &Container{ logger: logger, Name: name, - Runtime: *runtime, + runtime: "", client: client, } } +// AddProfile adds a profile to this container. +func (c *Container) AddProfile(p Profile) { + c.profiles = append(c.profiles, p) +} + +// RestartProfiles calls Restart on all profiles for this container. +func (c *Container) RestartProfiles() error { + for _, profile := range c.profiles { + if err := profile.Restart(c); err != nil { + return err + } + } + return nil +} + // Spawn is analogous to 'docker run -d'. func (c *Container) Spawn(ctx context.Context, r RunOpts, args ...string) error { - if err := c.create(ctx, r, args); err != nil { + if err := c.create(ctx, c.config(r, args), c.hostConfig(r), nil); err != nil { return err } return c.Start(ctx) @@ -153,7 +184,7 @@ func (c *Container) SpawnProcess(ctx context.Context, r RunOpts, args ...string) // Run is analogous to 'docker run'. func (c *Container) Run(ctx context.Context, r RunOpts, args ...string) (string, error) { - if err := c.create(ctx, r, args); err != nil { + if err := c.create(ctx, c.config(r, args), c.hostConfig(r), nil); err != nil { return "", err } @@ -181,27 +212,25 @@ func (c *Container) MakeLink(target string) string { // CreateFrom creates a container from the given configs. func (c *Container) CreateFrom(ctx context.Context, conf *container.Config, hostconf *container.HostConfig, netconf *network.NetworkingConfig) error { - cont, err := c.client.ContainerCreate(ctx, conf, hostconf, netconf, c.Name) - if err != nil { - return err - } - c.id = cont.ID - return nil + return c.create(ctx, conf, hostconf, netconf) } // Create is analogous to 'docker create'. func (c *Container) Create(ctx context.Context, r RunOpts, args ...string) error { - return c.create(ctx, r, args) + return c.create(ctx, c.config(r, args), c.hostConfig(r), nil) } -func (c *Container) create(ctx context.Context, r RunOpts, args []string) error { - conf := c.config(r, args) - hostconf := c.hostConfig(r) +func (c *Container) create(ctx context.Context, conf *container.Config, hostconf *container.HostConfig, netconf *network.NetworkingConfig) error { cont, err := c.client.ContainerCreate(ctx, conf, hostconf, nil, c.Name) if err != nil { return err } c.id = cont.ID + for _, profile := range c.profiles { + if err := profile.OnCreate(c); err != nil { + return fmt.Errorf("OnCreate method failed with: %v", err) + } + } return nil } @@ -227,7 +256,7 @@ func (c *Container) hostConfig(r RunOpts) *container.HostConfig { c.mounts = append(c.mounts, r.Mounts...) return &container.HostConfig{ - Runtime: c.Runtime, + Runtime: c.runtime, Mounts: c.mounts, PublishAllPorts: true, Links: r.Links, @@ -261,8 +290,15 @@ func (c *Container) Start(ctx context.Context) error { c.cleanups = append(c.cleanups, func() { c.streams.Close() }) - - return c.client.ContainerStart(ctx, c.id, types.ContainerStartOptions{}) + if err := c.client.ContainerStart(ctx, c.id, types.ContainerStartOptions{}); err != nil { + return fmt.Errorf("ContainerStart failed: %v", err) + } + for _, profile := range c.profiles { + if err := profile.OnStart(c); err != nil { + return fmt.Errorf("OnStart method failed: %v", err) + } + } + return nil } // Stop is analogous to 'docker stop'. @@ -482,6 +518,12 @@ func (c *Container) Remove(ctx context.Context) error { // CleanUp kills and deletes the container (best effort). func (c *Container) CleanUp(ctx context.Context) { + // Execute profile cleanups before the container goes down. + for _, profile := range c.profiles { + profile.OnCleanUp(c) + } + // Forget profiles. + c.profiles = nil // Kill the container. if err := c.Kill(ctx); err != nil && !strings.Contains(err.Error(), "is not running") { // Just log; can't do anything here. diff --git a/pkg/test/dockerutil/dockerutil.go b/pkg/test/dockerutil/dockerutil.go index df09babf3..5a9dd8bd8 100644 --- a/pkg/test/dockerutil/dockerutil.go +++ b/pkg/test/dockerutil/dockerutil.go @@ -25,6 +25,7 @@ import ( "os/exec" "regexp" "strconv" + "time" "gvisor.dev/gvisor/pkg/test/testutil" ) @@ -42,6 +43,26 @@ var ( // config is the default Docker daemon configuration path. config = flag.String("config_path", "/etc/docker/daemon.json", "configuration file for reading paths") + + // The following flags are for the "pprof" profiler tool. + + // pprofBaseDir allows the user to change the directory to which profiles are + // written. By default, profiles will appear under: + // /tmp/profile/RUNTIME/CONTAINER_NAME/*.pprof. + pprofBaseDir = flag.String("pprof-dir", "/tmp/profile", "base directory in: BASEDIR/RUNTIME/CONTINER_NAME/FILENAME (e.g. /tmp/profile/runtime/mycontainer/cpu.pprof)") + + // duration is the max duration `runsc debug` will run and capture profiles. + // If the container's clean up method is called prior to duration, the + // profiling process will be killed. + duration = flag.Duration("pprof-duration", 10*time.Second, "duration to run the profile in seconds") + + // The below flags enable each type of profile. Multiple profiles can be + // enabled for each run. + pprofBlock = flag.Bool("pprof-block", false, "enables block profiling with runsc debug") + pprofCPU = flag.Bool("pprof-cpu", false, "enables CPU profiling with runsc debug") + pprofGo = flag.Bool("pprof-go", false, "enables goroutine profiling with runsc debug") + pprofHeap = flag.Bool("pprof-heap", false, "enables heap profiling with runsc debug") + pprofMutex = flag.Bool("pprof-mutex", false, "enables mutex profiling with runsc debug") ) // EnsureSupportedDockerVersion checks if correct docker is installed. diff --git a/pkg/test/dockerutil/exec.go b/pkg/test/dockerutil/exec.go index 921d1da9e..4c739c9e9 100644 --- a/pkg/test/dockerutil/exec.go +++ b/pkg/test/dockerutil/exec.go @@ -87,7 +87,6 @@ func (c *Container) doExec(ctx context.Context, r ExecOpts, args []string) (Proc execid: resp.ID, conn: hijack, }, nil - } func (c *Container) execConfig(r ExecOpts, cmd []string) types.ExecConfig { diff --git a/pkg/test/dockerutil/profile.go b/pkg/test/dockerutil/profile.go new file mode 100644 index 000000000..1fab33083 --- /dev/null +++ b/pkg/test/dockerutil/profile.go @@ -0,0 +1,152 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package dockerutil + +import ( + "context" + "fmt" + "io" + "os" + "os/exec" + "path/filepath" + "time" +) + +// Profile represents profile-like operations on a container, +// such as running perf or pprof. It is meant to be added to containers +// such that the container type calls the Profile during its lifecycle. +type Profile interface { + // OnCreate is called just after the container is created when the container + // has a valid ID (e.g. c.ID()). + OnCreate(c *Container) error + + // OnStart is called just after the container is started when the container + // has a valid Pid (e.g. c.SandboxPid()). + OnStart(c *Container) error + + // Restart restarts the Profile on request. + Restart(c *Container) error + + // OnCleanUp is called during the container's cleanup method. + // Cleanups should just log errors if they have them. + OnCleanUp(c *Container) error +} + +// Pprof is for running profiles with 'runsc debug'. Pprof workloads +// should be run as root and ONLY against runsc sandboxes. The runtime +// should have --profile set as an option in /etc/docker/daemon.json in +// order for profiling to work with Pprof. +type Pprof struct { + BasePath string // path to put profiles + BlockProfile bool + CPUProfile bool + GoRoutineProfile bool + HeapProfile bool + MutexProfile bool + Duration time.Duration // duration to run profiler e.g. '10s' or '1m'. + shouldRun bool + cmd *exec.Cmd + stdout io.ReadCloser + stderr io.ReadCloser +} + +// MakePprofFromFlags makes a Pprof profile from flags. +func MakePprofFromFlags(c *Container) *Pprof { + if !(*pprofBlock || *pprofCPU || *pprofGo || *pprofHeap || *pprofMutex) { + return nil + } + return &Pprof{ + BasePath: filepath.Join(*pprofBaseDir, c.runtime, c.Name), + BlockProfile: *pprofBlock, + CPUProfile: *pprofCPU, + GoRoutineProfile: *pprofGo, + HeapProfile: *pprofHeap, + MutexProfile: *pprofMutex, + Duration: *duration, + } +} + +// OnCreate implements Profile.OnCreate. +func (p *Pprof) OnCreate(c *Container) error { + return os.MkdirAll(p.BasePath, 0755) +} + +// OnStart implements Profile.OnStart. +func (p *Pprof) OnStart(c *Container) error { + path, err := RuntimePath() + if err != nil { + return fmt.Errorf("failed to get runtime path: %v", err) + } + + // The root directory of this container's runtime. + root := fmt.Sprintf("--root=/var/run/docker/runtime-%s/moby", c.runtime) + // Format is `runsc --root=rootdir debug --profile-*=file --duration=* containerID`. + args := []string{root, "debug"} + args = append(args, p.makeProfileArgs(c)...) + args = append(args, c.ID()) + + // Best effort wait until container is running. + for now := time.Now(); time.Since(now) < 5*time.Second; { + if status, err := c.Status(context.Background()); err != nil { + return fmt.Errorf("failed to get status with: %v", err) + + } else if status.Running { + break + } + time.Sleep(500 * time.Millisecond) + } + p.cmd = exec.Command(path, args...) + if err := p.cmd.Start(); err != nil { + return fmt.Errorf("process failed: %v", err) + } + return nil +} + +// Restart implements Profile.Restart. +func (p *Pprof) Restart(c *Container) error { + p.OnCleanUp(c) + return p.OnStart(c) +} + +// OnCleanUp implements Profile.OnCleanup +func (p *Pprof) OnCleanUp(c *Container) error { + defer func() { p.cmd = nil }() + if p.cmd != nil && p.cmd.Process != nil && p.cmd.ProcessState != nil && !p.cmd.ProcessState.Exited() { + return p.cmd.Process.Kill() + } + return nil +} + +// makeProfileArgs turns Pprof fields into runsc debug flags. +func (p *Pprof) makeProfileArgs(c *Container) []string { + var ret []string + if p.BlockProfile { + ret = append(ret, fmt.Sprintf("--profile-block=%s", filepath.Join(p.BasePath, "block.pprof"))) + } + if p.CPUProfile { + ret = append(ret, fmt.Sprintf("--profile-cpu=%s", filepath.Join(p.BasePath, "cpu.pprof"))) + } + if p.GoRoutineProfile { + ret = append(ret, fmt.Sprintf("--profile-goroutine=%s", filepath.Join(p.BasePath, "go.pprof"))) + } + if p.HeapProfile { + ret = append(ret, fmt.Sprintf("--profile-heap=%s", filepath.Join(p.BasePath, "heap.pprof"))) + } + if p.MutexProfile { + ret = append(ret, fmt.Sprintf("--profile-mutex=%s", filepath.Join(p.BasePath, "mutex.pprof"))) + } + ret = append(ret, fmt.Sprintf("--duration=%s", p.Duration)) + return ret +} diff --git a/pkg/test/dockerutil/profile_test.go b/pkg/test/dockerutil/profile_test.go new file mode 100644 index 000000000..b7b4d7618 --- /dev/null +++ b/pkg/test/dockerutil/profile_test.go @@ -0,0 +1,117 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package dockerutil + +import ( + "context" + "fmt" + "os" + "path/filepath" + "testing" + "time" +) + +type testCase struct { + name string + pprof Pprof + expectedFiles []string +} + +func TestPprof(t *testing.T) { + // Basepath and expected file names for each type of profile. + basePath := "/tmp/test/profile" + block := "block.pprof" + cpu := "cpu.pprof" + goprofle := "go.pprof" + heap := "heap.pprof" + mutex := "mutex.pprof" + + testCases := []testCase{ + { + name: "Cpu", + pprof: Pprof{ + BasePath: basePath, + CPUProfile: true, + Duration: 2 * time.Second, + }, + expectedFiles: []string{cpu}, + }, + { + name: "All", + pprof: Pprof{ + BasePath: basePath, + BlockProfile: true, + CPUProfile: true, + GoRoutineProfile: true, + HeapProfile: true, + MutexProfile: true, + Duration: 2 * time.Second, + }, + expectedFiles: []string{block, cpu, goprofle, heap, mutex}, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + ctx := context.Background() + c := MakeContainer(ctx, t) + // Set basepath to include the container name so there are no conflicts. + tc.pprof.BasePath = filepath.Join(tc.pprof.BasePath, c.Name) + c.AddProfile(&tc.pprof) + + func() { + defer c.CleanUp(ctx) + // Start a container. + if err := c.Spawn(ctx, RunOpts{ + Image: "basic/alpine", + }, "sleep", "1000"); err != nil { + t.Fatalf("run failed with: %v", err) + } + + if status, err := c.Status(context.Background()); !status.Running { + t.Fatalf("container is not yet running: %+v err: %v", status, err) + } + + // End early if the expected files exist and have data. + for start := time.Now(); time.Since(start) < tc.pprof.Duration; time.Sleep(500 * time.Millisecond) { + if err := checkFiles(tc); err == nil { + break + } + } + }() + + // Check all expected files exist and have data. + if err := checkFiles(tc); err != nil { + t.Fatalf(err.Error()) + } + }) + } +} + +func checkFiles(tc testCase) error { + for _, file := range tc.expectedFiles { + stat, err := os.Stat(filepath.Join(tc.pprof.BasePath, file)) + if err != nil { + return fmt.Errorf("stat failed with: %v", err) + } else if stat.Size() < 1 { + return fmt.Errorf("file not written to: %+v", stat) + } + } + return nil +} + +func TestMain(m *testing.M) { + EnsureSupportedDockerVersion() + os.Exit(m.Run()) +} diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 55d45aaa6..9f52438c2 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -90,6 +90,7 @@ go_library( "//pkg/tcpip", "//pkg/tcpip/link/fdbased", "//pkg/tcpip/link/loopback", + "//pkg/tcpip/link/packetsocket", "//pkg/tcpip/link/qdisc/fifo", "//pkg/tcpip/link/sniffer", "//pkg/tcpip/network/arp", diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 14d2f56a5..4e1fa7665 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/link/fdbased" "gvisor.dev/gvisor/pkg/tcpip/link/loopback" + "gvisor.dev/gvisor/pkg/tcpip/link/packetsocket" "gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" "gvisor.dev/gvisor/pkg/tcpip/network/arp" @@ -252,6 +253,9 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000) } + // Enable support for AF_PACKET sockets to receive outgoing packets. + linkEP = packetsocket.New(linkEP) + log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { return err diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 56f4ba15d..9a1ed8e9e 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -77,6 +77,10 @@ func registerFilesystems(k *kernel.Kernel) error { AllowUserMount: true, AllowUserList: true, }) + vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) // Setup files in devtmpfs. if err := memdev.Register(vfsObj); err != nil { @@ -119,6 +123,7 @@ func registerFilesystems(k *kernel.Kernel) error { return fmt.Errorf("creating fusedev devtmpfs files: %w", err) } } + return nil } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index cd76645bd..5e8247bc8 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -643,7 +643,9 @@ func TestExec(t *testing.T) { if err != nil { t.Fatalf("error creating temporary directory: %v", err) } - cmd := fmt.Sprintf("ln -s /bin/true %q/symlink && sleep 100", dir) + // Note that some shells may exec the final command in a sequence as + // an optimization. We avoid this here by adding the exit 0. + cmd := fmt.Sprintf("ln -s /bin/true %q/symlink && sleep 100 && exit 0", dir) spec := testutil.NewSpecWithArgs("sh", "-c", cmd) _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index 1036b0630..05e3637f7 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -31,5 +31,6 @@ go_test( deps = [ "//pkg/log", "//pkg/p9", + "//pkg/test/testutil", ], ) diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index b7521bda7..ebefeacf2 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -48,36 +48,6 @@ const ( openFlags = syscall.O_NOFOLLOW | syscall.O_CLOEXEC ) -type fileType int - -const ( - regular fileType = iota - directory - symlink - socket - unknown -) - -// String implements fmt.Stringer. -func (f fileType) String() string { - switch f { - case regular: - return "regular" - case directory: - return "directory" - case symlink: - return "symlink" - case socket: - return "socket" - } - return "unknown" -} - -// ControlSocketAddr generates an abstract unix socket name for the given id. -func ControlSocketAddr(id string) string { - return fmt.Sprintf("\x00runsc-gofer.%s", id) -} - // Config sets configuration options for each attach point. type Config struct { // ROMount is set to true if this is a readonly mount. @@ -132,7 +102,7 @@ func (a *attachPoint) Attach() (p9.File, error) { return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix) } - f, err := openAnyFile(a.prefix, func(mode int) (*fd.FD, error) { + f, readable, err := openAnyFile(a.prefix, func(mode int) (*fd.FD, error) { return fd.Open(a.prefix, openFlags|mode, 0) }) if err != nil { @@ -144,7 +114,7 @@ func (a *attachPoint) Attach() (p9.File, error) { return nil, fmt.Errorf("unable to stat %q: %v", a.prefix, err) } - lf, err := newLocalFile(a, f, a.prefix, stat) + lf, err := newLocalFile(a, f, a.prefix, readable, stat) if err != nil { return nil, fmt.Errorf("unable to create localFile %q: %v", a.prefix, err) } @@ -199,8 +169,6 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID { // entire file up when it's opened in write mode, and would perform badly when // multiple files are only being opened for read (esp. startup). type localFile struct { - p9.DefaultWalkGetAttr - // attachPoint is the attachPoint that serves this localFile. attachPoint *attachPoint @@ -212,12 +180,19 @@ type localFile struct { // opened with. file *fd.FD + // controlReadable tells whether 'file' was opened with read permissions + // during a walk. + controlReadable bool + // mode is the mode in which the file was opened. Set to invalidMode // if localFile isn't opened. mode p9.OpenFlags - // ft is the fileType for this file. - ft fileType + // fileType for this file. It is equivalent to: + // syscall.Stat_t.Mode & syscall.S_IFMT + fileType uint32 + + qid p9.QID // readDirMu protects against concurrent Readdir calls. readDirMu sync.Mutex @@ -251,83 +226,88 @@ func reopenProcFd(f *fd.FD, mode int) (*fd.FD, error) { return fd.New(d), nil } -func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, error) { +func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, bool, error) { path := path.Join(parent.hostPath, name) - f, err := openAnyFile(path, func(mode int) (*fd.FD, error) { + f, readable, err := openAnyFile(path, func(mode int) (*fd.FD, error) { return fd.OpenAt(parent.file, name, openFlags|mode, 0) }) - return f, path, err + return f, path, readable, err } // openAnyFile attempts to open the file in O_RDONLY and if it fails fallsback // to O_PATH. 'path' is used for logging messages only. 'fn' is what does the // actual file open and is customizable by the caller. -func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, error) { +func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, bool, error) { // Attempt to open file in the following mode in order: // 1. RDONLY | NONBLOCK: for all files, directories, ro mounts, FIFOs. // Use non-blocking to prevent getting stuck inside open(2) for // FIFOs. This option has no effect on regular files. // 2. PATH: for symlinks, sockets. - modes := []int{syscall.O_RDONLY | syscall.O_NONBLOCK, unix.O_PATH} + options := []struct { + mode int + readable bool + }{ + { + mode: syscall.O_RDONLY | syscall.O_NONBLOCK, + readable: true, + }, + { + mode: unix.O_PATH, + readable: false, + }, + } var err error - var file *fd.FD - for i, mode := range modes { - file, err = fn(mode) + for i, option := range options { + var file *fd.FD + file, err = fn(option.mode) if err == nil { - // openat succeeded, we're done. - break + // Succeeded opening the file, we're done. + return file, option.readable, nil } switch e := extractErrno(err); e { case syscall.ENOENT: // File doesn't exist, no point in retrying. - return nil, e + return nil, false, e } - // openat failed. Try again with next mode, preserving 'err' in case this - // was the last attempt. - log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", i, openFlags|mode, path, err) - } - if err != nil { - // All attempts to open file have failed, return the last error. - log.Debugf("Failed to open file, path: %q, err: %v", path, err) - return nil, extractErrno(err) + // File failed to open. Try again with next mode, preserving 'err' in case + // this was the last attempt. + log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", i, openFlags|option.mode, path, err) } - - return file, nil + // All attempts to open file have failed, return the last error. + log.Debugf("Failed to open file, path: %q, err: %v", path, err) + return nil, false, extractErrno(err) } -func getSupportedFileType(stat syscall.Stat_t, permitSocket bool) (fileType, error) { - var ft fileType +func checkSupportedFileType(stat syscall.Stat_t, permitSocket bool) error { switch stat.Mode & syscall.S_IFMT { - case syscall.S_IFREG: - ft = regular - case syscall.S_IFDIR: - ft = directory - case syscall.S_IFLNK: - ft = symlink + case syscall.S_IFREG, syscall.S_IFDIR, syscall.S_IFLNK: + return nil + case syscall.S_IFSOCK: if !permitSocket { - return unknown, syscall.EPERM + return syscall.EPERM } - ft = socket + return nil + default: - return unknown, syscall.EPERM + return syscall.EPERM } - return ft, nil } -func newLocalFile(a *attachPoint, file *fd.FD, path string, stat syscall.Stat_t) (*localFile, error) { - ft, err := getSupportedFileType(stat, a.conf.HostUDS) - if err != nil { +func newLocalFile(a *attachPoint, file *fd.FD, path string, readable bool, stat syscall.Stat_t) (*localFile, error) { + if err := checkSupportedFileType(stat, a.conf.HostUDS); err != nil { return nil, err } return &localFile{ - attachPoint: a, - hostPath: path, - file: file, - mode: invalidMode, - ft: ft, + attachPoint: a, + hostPath: path, + file: file, + mode: invalidMode, + fileType: stat.Mode & syscall.S_IFMT, + qid: a.makeQID(stat), + controlReadable: readable, }, nil } @@ -346,7 +326,7 @@ func newFDMaybe(file *fd.FD) *fd.FD { // fd is blocking; non-blocking is required. if err := syscall.SetNonblock(dup.FD(), true); err != nil { - dup.Close() + _ = dup.Close() return nil } return dup @@ -380,7 +360,7 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { // Check if control file can be used or if a new open must be created. var newFile *fd.FD - if flags == p9.ReadOnly { + if flags == p9.ReadOnly && l.controlReadable { log.Debugf("Open reusing control file, flags: %v, %q", flags, l.hostPath) newFile = l.file } else { @@ -396,16 +376,8 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { } } - stat, err := fstat(newFile.FD()) - if err != nil { - if newFile != l.file { - newFile.Close() - } - return nil, p9.QID{}, 0, extractErrno(err) - } - var fd *fd.FD - if stat.Mode&syscall.S_IFMT == syscall.S_IFREG { + if l.fileType == syscall.S_IFREG { // Donate FD for regular files only. fd = newFDMaybe(newFile) } @@ -418,7 +390,7 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { l.file = newFile } l.mode = flags & p9.OpenFlagsModeMask - return fd, l.attachPoint.makeQID(stat), 0, nil + return fd, l.qid, 0, nil } // Create implements p9.File. @@ -446,7 +418,7 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid return nil, nil, p9.QID{}, 0, extractErrno(err) } cu := cleanup.Make(func() { - child.Close() + _ = child.Close() // Best effort attempt to remove the file in case of failure. if err := syscall.Unlinkat(l.file.FD(), name); err != nil { log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, name), err) @@ -467,10 +439,12 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid hostPath: path.Join(l.hostPath, name), file: child, mode: mode, + fileType: syscall.S_IFREG, + qid: l.attachPoint.makeQID(stat), } cu.Release() - return newFDMaybe(c.file), c, l.attachPoint.makeQID(stat), 0, nil + return newFDMaybe(c.file), c, c.qid, 0, nil } // Mkdir implements p9.File. @@ -516,55 +490,74 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) // Walk implements p9.File. func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { + qids, file, _, err := l.walk(names) + return qids, file, err +} + +// WalkGetAttr implements p9.File. +func (l *localFile) WalkGetAttr(names []string) ([]p9.QID, p9.File, p9.AttrMask, p9.Attr, error) { + qids, file, stat, err := l.walk(names) + if err != nil { + return nil, nil, p9.AttrMask{}, p9.Attr{}, err + } + mask, attr := l.fillAttr(stat) + return qids, file, mask, attr, nil +} + +func (l *localFile) walk(names []string) ([]p9.QID, p9.File, syscall.Stat_t, error) { // Duplicate current file if 'names' is empty. if len(names) == 0 { - newFile, err := openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) { + newFile, readable, err := openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) { return reopenProcFd(l.file, openFlags|mode) }) if err != nil { - return nil, nil, extractErrno(err) + return nil, nil, syscall.Stat_t{}, extractErrno(err) } stat, err := fstat(newFile.FD()) if err != nil { - newFile.Close() - return nil, nil, extractErrno(err) + _ = newFile.Close() + return nil, nil, syscall.Stat_t{}, extractErrno(err) } c := &localFile{ - attachPoint: l.attachPoint, - hostPath: l.hostPath, - file: newFile, - mode: invalidMode, + attachPoint: l.attachPoint, + hostPath: l.hostPath, + file: newFile, + mode: invalidMode, + fileType: l.fileType, + qid: l.attachPoint.makeQID(stat), + controlReadable: readable, } - return []p9.QID{l.attachPoint.makeQID(stat)}, c, nil + return []p9.QID{c.qid}, c, stat, nil } var qids []p9.QID + var lastStat syscall.Stat_t last := l for _, name := range names { - f, path, err := openAnyFileFromParent(last, name) + f, path, readable, err := openAnyFileFromParent(last, name) if last != l { - last.Close() + _ = last.Close() } if err != nil { - return nil, nil, extractErrno(err) + return nil, nil, syscall.Stat_t{}, extractErrno(err) } - stat, err := fstat(f.FD()) + lastStat, err = fstat(f.FD()) if err != nil { - f.Close() - return nil, nil, extractErrno(err) + _ = f.Close() + return nil, nil, syscall.Stat_t{}, extractErrno(err) } - c, err := newLocalFile(last.attachPoint, f, path, stat) + c, err := newLocalFile(last.attachPoint, f, path, readable, lastStat) if err != nil { - f.Close() - return nil, nil, extractErrno(err) + _ = f.Close() + return nil, nil, syscall.Stat_t{}, extractErrno(err) } - qids = append(qids, l.attachPoint.makeQID(stat)) + qids = append(qids, c.qid) last = c } - return qids, last, nil + return qids, last, lastStat, nil } // StatFS implements p9.File. @@ -604,7 +597,11 @@ func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) if err != nil { return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err) } + mask, attr := l.fillAttr(stat) + return l.qid, mask, attr, nil +} +func (l *localFile) fillAttr(stat syscall.Stat_t) (p9.AttrMask, p9.Attr) { attr := p9.Attr{ Mode: p9.FileMode(stat.Mode), UID: p9.UID(stat.Uid), @@ -633,8 +630,7 @@ func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) MTime: true, CTime: true, } - - return l.attachPoint.makeQID(stat), valid, attr, nil + return valid, attr } // SetAttr implements p9.File. Due to mismatch in file API, options @@ -675,7 +671,7 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { // Check if it's possible to use cached file, or if another one needs to be // opened for write. f := l.file - if l.ft == regular && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite { + if l.fileType == syscall.S_IFREG && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite { var err error f, err = reopenProcFd(l.file, openFlags|os.O_WRONLY) if err != nil { @@ -731,7 +727,7 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { } } - if l.ft == symlink { + if l.fileType == syscall.S_IFLNK { // utimensat operates different that other syscalls. To operate on a // symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty // name. @@ -915,7 +911,7 @@ func (l *localFile) Link(target p9.File, newName string) error { } // Mknod implements p9.File. -func (l *localFile) Mknod(name string, mode p9.FileMode, _ uint32, _ uint32, uid p9.UID, gid p9.GID) (p9.QID, error) { +func (l *localFile) Mknod(name string, mode p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ p9.GID) (p9.QID, error) { conf := l.attachPoint.conf if conf.ROMount { if conf.PanicOnWrite { @@ -1113,13 +1109,13 @@ func (l *localFile) Connect(flags p9.ConnectFlags) (*fd.FD, error) { } if err := syscall.SetNonblock(f, true); err != nil { - syscall.Close(f) + _ = syscall.Close(f) return nil, err } sa := syscall.SockaddrUnix{Name: l.hostPath} if err := syscall.Connect(f, &sa); err != nil { - syscall.Close(f) + _ = syscall.Close(f) return nil, err } diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index 05af7e397..94f167417 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -26,6 +26,19 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/test/testutil" +) + +var allOpenFlags = []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite} + +var ( + allTypes = []uint32{syscall.S_IFREG, syscall.S_IFDIR, syscall.S_IFLNK} + + // allConfs is set in init(). + allConfs []Config + + rwConfs = []Config{{ROMount: false}} + roConfs = []Config{{ROMount: true}} ) func init() { @@ -39,6 +52,13 @@ func init() { } } +func configTestName(config *Config) string { + if config.ROMount { + return "ROMount" + } + return "RWMount" +} + func assertPanic(t *testing.T, f func()) { defer func() { if r := recover(); r == nil { @@ -88,71 +108,76 @@ func testReadWrite(f p9.File, flags p9.OpenFlags, content []byte) error { return nil } -var allOpenFlags = []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite} - -var ( - allTypes = []fileType{regular, directory, symlink} - - // allConfs is set in init() above. - allConfs []Config - - rwConfs = []Config{{ROMount: false}} - roConfs = []Config{{ROMount: true}} -) - type state struct { - root *localFile - file *localFile - conf Config - ft fileType + root *localFile + file *localFile + conf Config + fileType uint32 } func (s state) String() string { - return fmt.Sprintf("type(%v)", s.ft) + return fmt.Sprintf("type(%v)", s.fileType) +} + +func typeName(fileType uint32) string { + switch fileType { + case syscall.S_IFREG: + return "file" + case syscall.S_IFDIR: + return "directory" + case syscall.S_IFLNK: + return "symlink" + default: + panic(fmt.Sprintf("invalid file type for test: %d", fileType)) + } } func runAll(t *testing.T, test func(*testing.T, state)) { runCustom(t, allTypes, allConfs, test) } -func runCustom(t *testing.T, types []fileType, confs []Config, test func(*testing.T, state)) { +func runCustom(t *testing.T, types []uint32, confs []Config, test func(*testing.T, state)) { for _, c := range confs { - t.Logf("Config: %+v", c) - for _, ft := range types { - t.Logf("File type: %v", ft) + name := fmt.Sprintf("%s/%s", configTestName(&c), typeName(ft)) + t.Run(name, func(t *testing.T) { + path, name, err := setup(ft) + if err != nil { + t.Fatalf("%v", err) + } + defer os.RemoveAll(path) - path, name, err := setup(ft) - if err != nil { - t.Fatalf("%v", err) - } - defer os.RemoveAll(path) + a, err := NewAttachPoint(path, c) + if err != nil { + t.Fatalf("NewAttachPoint failed: %v", err) + } + root, err := a.Attach() + if err != nil { + t.Fatalf("Attach failed, err: %v", err) + } - a, err := NewAttachPoint(path, c) - if err != nil { - t.Fatalf("NewAttachPoint failed: %v", err) - } - root, err := a.Attach() - if err != nil { - t.Fatalf("Attach failed, err: %v", err) - } + _, file, err := root.Walk([]string{name}) + if err != nil { + root.Close() + t.Fatalf("root.Walk({%q}) failed, err: %v", "symlink", err) + } - _, file, err := root.Walk([]string{name}) - if err != nil { + st := state{ + root: root.(*localFile), + file: file.(*localFile), + conf: c, + fileType: ft, + } + test(t, st) + file.Close() root.Close() - t.Fatalf("root.Walk({%q}) failed, err: %v", "symlink", err) - } - - st := state{root: root.(*localFile), file: file.(*localFile), conf: c, ft: ft} - test(t, st) - file.Close() - root.Close() + }) } } } -func setup(ft fileType) (string, string, error) { - path, err := ioutil.TempDir("", "root-") +func setup(fileType uint32) (string, string, error) { + path, err := ioutil.TempDir(testutil.TmpDir(), "root-") if err != nil { return "", "", fmt.Errorf("ioutil.TempDir() failed, err: %v", err) } @@ -169,26 +194,26 @@ func setup(ft fileType) (string, string, error) { defer root.Close() var name string - switch ft { - case regular: + switch fileType { + case syscall.S_IFREG: name = "file" _, f, _, _, err := root.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) if err != nil { return "", "", fmt.Errorf("createFile(root, %q) failed, err: %v", "test", err) } defer f.Close() - case directory: + case syscall.S_IFDIR: name = "dir" if _, err := root.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { return "", "", fmt.Errorf("root.MkDir(%q) failed, err: %v", name, err) } - case symlink: + case syscall.S_IFLNK: name = "symlink" if _, err := root.Symlink("/some/target", name, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { return "", "", fmt.Errorf("root.Symlink(%q) failed, err: %v", name, err) } default: - panic(fmt.Sprintf("unknown file type %v", ft)) + panic(fmt.Sprintf("unknown file type %v", fileType)) } return path, name, nil } @@ -202,7 +227,7 @@ func createFile(dir *localFile, name string) (*localFile, error) { } func TestReadWrite(t *testing.T) { - runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{syscall.S_IFDIR}, rwConfs, func(t *testing.T, s state) { child, err := createFile(s.file, "test") if err != nil { t.Fatalf("%v: createFile() failed, err: %v", s, err) @@ -232,7 +257,7 @@ func TestReadWrite(t *testing.T) { } func TestCreate(t *testing.T) { - runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{syscall.S_IFDIR}, rwConfs, func(t *testing.T, s state) { for i, flags := range allOpenFlags { _, l, _, _, err := s.file.Create(fmt.Sprintf("test-%d", i), flags, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) if err != nil { @@ -249,7 +274,7 @@ func TestCreate(t *testing.T) { // TestReadWriteDup tests that a file opened in any mode can be dup'ed and // reopened in any other mode. func TestReadWriteDup(t *testing.T) { - runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{syscall.S_IFDIR}, rwConfs, func(t *testing.T, s state) { child, err := createFile(s.file, "test") if err != nil { t.Fatalf("%v: createFile() failed, err: %v", s, err) @@ -291,7 +316,7 @@ func TestReadWriteDup(t *testing.T) { } func TestUnopened(t *testing.T) { - runCustom(t, []fileType{regular}, allConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{syscall.S_IFREG}, allConfs, func(t *testing.T, s state) { b := []byte("foobar") if _, err := s.file.WriteAt(b, 0); err != syscall.EBADF { t.Errorf("%v: WriteAt() should have failed, got: %v, expected: syscall.EBADF", s, err) @@ -308,6 +333,32 @@ func TestUnopened(t *testing.T) { }) } +// TestOpenOPath is a regression test to ensure that a file that cannot be open +// for read is allowed to be open. This was happening because the control file +// was open with O_PATH, but Open() was not checking for it and allowing the +// control file to be reused. +func TestOpenOPath(t *testing.T) { + runCustom(t, []uint32{syscall.S_IFREG}, rwConfs, func(t *testing.T, s state) { + // Fist remove all permissions on the file. + if err := s.file.SetAttr(p9.SetAttrMask{Permissions: true}, p9.SetAttr{Permissions: p9.FileMode(0)}); err != nil { + t.Fatalf("SetAttr(): %v", err) + } + // Then walk to the file again to open a new control file. + filename := filepath.Base(s.file.hostPath) + _, newFile, err := s.root.Walk([]string{filename}) + if err != nil { + t.Fatalf("root.Walk(%q): %v", filename, err) + } + + if newFile.(*localFile).controlReadable { + t.Fatalf("control file didn't open with O_PATH: %+v", newFile) + } + if _, _, _, err := newFile.Open(p9.ReadOnly); err != syscall.EACCES { + t.Fatalf("Open() should have failed, got: %v, wanted: EACCES", err) + } + }) +} + func SetGetAttr(l *localFile, valid p9.SetAttrMask, attr p9.SetAttr) (p9.Attr, error) { if err := l.SetAttr(valid, attr); err != nil { return p9.Attr{}, err @@ -324,7 +375,7 @@ func TestSetAttrPerm(t *testing.T) { valid := p9.SetAttrMask{Permissions: true} attr := p9.SetAttr{Permissions: 0777} got, err := SetGetAttr(s.file, valid, attr) - if s.ft == symlink { + if s.fileType == syscall.S_IFLNK { if err == nil { t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions) } @@ -345,7 +396,7 @@ func TestSetAttrSize(t *testing.T) { valid := p9.SetAttrMask{Size: true} attr := p9.SetAttr{Size: size} got, err := SetGetAttr(s.file, valid, attr) - if s.ft == symlink || s.ft == directory { + if s.fileType == syscall.S_IFLNK || s.fileType == syscall.S_IFDIR { if err == nil { t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions) } @@ -427,7 +478,7 @@ func TestLink(t *testing.T) { } err = dir.Link(s.file, linkFile) - if s.ft == directory { + if s.fileType == syscall.S_IFDIR { if err != syscall.EPERM { t.Errorf("%v: Link(target, %s) should have failed, got: %v, expected: syscall.EPERM", s, linkFile, err) } @@ -485,7 +536,7 @@ func TestROMountPanics(t *testing.T) { } func TestWalkNotFound(t *testing.T) { - runCustom(t, []fileType{directory}, allConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{syscall.S_IFDIR}, allConfs, func(t *testing.T, s state) { if _, _, err := s.file.Walk([]string{"nobody-here"}); err != syscall.ENOENT { t.Errorf("%v: Walk(%q) should have failed, got: %v, expected: syscall.ENOENT", s, "nobody-here", err) } @@ -506,7 +557,7 @@ func TestWalkDup(t *testing.T) { } func TestReaddir(t *testing.T) { - runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{syscall.S_IFDIR}, rwConfs, func(t *testing.T, s state) { name := "dir" if _, err := s.file.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { t.Fatalf("%v: MkDir(%s) failed, err: %v", s, name, err) diff --git a/scripts/benchmark.sh b/scripts/benchmark.sh deleted file mode 100755 index c49f988b8..000000000 --- a/scripts/benchmark.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -# Copyright 2020 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -source $(dirname $0)/common.sh - -make load-all-images - -if [[ -z "${1:-}" ]]; then - target=$(query "attr(tags, manual, tests(//test/benchmarks/...))") -else - target="$1" -fi - -install_runsc_for_benchmarks benchmark - -echo $target -benchmark_runsc $target "${@:2}" diff --git a/scripts/common.sh b/scripts/common.sh index 36158654f..3ca699e4a 100755 --- a/scripts/common.sh +++ b/scripts/common.sh @@ -42,15 +42,6 @@ function test_runsc() { test --test_arg=--runtime=${RUNTIME} "$@" } -function benchmark_runsc() { - test_runsc -c opt \ - --nocache_test_results \ - --test_arg=-test.bench=. \ - --test_arg=-test.benchmem \ - --jobs=1 \ - "$@" -} - function install_runsc_for_test() { local -r test_name=$1 shift @@ -72,24 +63,6 @@ function install_runsc_for_test() { "$@" } -function install_runsc_for_benchmarks() { - local -r test_name=$1 - shift - if [[ -z "${test_name}" ]]; then - echo "Missing mandatory test name" - exit 1 - fi - - # Add test to the name, so it doesn't conflict with other runtimes. - set_runtime $(find_branch_name)_"${test_name}" - - # ${RUNSC_TEST_NAME} is set by tests (see dockerutil) to pass the test name - # down to the runtime. - install_runsc "${RUNTIME}" \ - --TESTONLY-test-name-env=RUNSC_TEST_NAME \ - "$@" -} - # Installs the runsc with given runtime name. set_runtime must have been called # to set runtime and logs location. function install_runsc() { diff --git a/scripts/docker_tests.sh b/scripts/docker_tests.sh index dce0a4085..07e9f3109 100755 --- a/scripts/docker_tests.sh +++ b/scripts/docker_tests.sh @@ -22,4 +22,6 @@ install_runsc_for_test docker test_runsc //test/image:image_test //test/e2e:integration_test install_runsc_for_test docker --vfs2 -test_runsc //test/image:image_test --test_filter=.*TestHelloWorld +IMAGE_FILTER="Hello|Httpd|Ruby|Stdio" +INTEGRATION_FILTER="LifeCycle|Pause|Connect|JobControl|Overlay|Exec|DirCreation/root" +test_runsc //test/e2e:integration_test //test/image:image_test --test_filter="${IMAGE_FILTER}|${INTEGRATION_FILTER}" diff --git a/test/benchmarks/README.md b/test/benchmarks/README.md index 9ff602cf1..d1bbabf6f 100644 --- a/test/benchmarks/README.md +++ b/test/benchmarks/README.md @@ -13,33 +13,51 @@ To run benchmarks you will need: * Docker installed (17.09.0 or greater). -The easiest way to run benchmarks is to use the script at -//scripts/benchmark.sh. +The easiest way to setup runsc for running benchmarks is to use the make file. +From the root directory: -If not using the script, you will need: +* Download images: `make load-all-images` +* Install runsc suitable for benchmarking, which should probably not have + strace or debug logs enabled. For example:`make configure RUNTIME=myrunsc + ARGS=--platform=kvm`. +* Restart docker: `sudo service docker restart` -* `runsc` configured with docker +You should now have a runtime with the following options configured in +`/etc/docker/daemon.json` -Note: benchmarks call the runtime by name. If docker can run it with -`--runtime=` flag, these tools should work. +``` +"myrunsc": { + "path": "/tmp/myrunsc/runsc", + "runtimeArgs": [ + "--debug-log", + "/tmp/bench/logs/runsc.log.%TEST%.%TIMESTAMP%.%COMMAND%", + "--platform=kvm" + ] + }, + +``` + +This runtime has been configured with a debugging off and strace logs off and is +using kvm for demonstration. ## Running benchmarks -The easiest way to run is with the script at //scripts/benchmarks.sh. The script -will run all benchmarks under //test/benchmarks if a target is not provided. +Given the runtime above runtime `myrunsc`, run benchmarks with the following: -```bash -./script/benchmarks.sh //path/to/target +``` +make sudo TARGETS=//path/to:target ARGS="--runtime=myrunsc -test.v \ + -test.bench=." OPTIONS="-c opt ``` -If you want to run benchmarks manually: - -* Run `make load-all-images` from `//` -* Run with: +For example, to run only the Iperf tests: -```bash -bazel test --test_arg=--runtime=RUNTIME -c opt --test_output=streamed --test_timeout=600 --test_arg=-test.bench=. --nocache_test_results //path/to/target ``` +make sudo TARGETS=//test/benchmarks/network:network_test \ + ARGS="--runtime=myrunsc -test.v -test.bench=Iperf" OPTIONS="-c opt" +``` + +Benchmarks are run with root as some benchmarks require root privileges to do +things like drop caches. ## Writing benchmarks @@ -69,6 +87,7 @@ var h harness.Harness func BenchmarkMyCoolOne(b *testing.B) { machine, err := h.GetMachine() // check err + defer machine.CleanUp() ctx := context.Background() container := machine.GetContainer(ctx, b) @@ -82,7 +101,7 @@ func BenchmarkMyCoolOne(b *testing.B) { Image: "benchmarks/my-cool-image", Env: []string{"MY_VAR=awesome"}, other options...see dockerutil - }, "sh", "-c", "echo MY_VAR" ...) + }, "sh", "-c", "echo MY_VAR") //check err b.StopTimer() @@ -107,12 +126,32 @@ Some notes on the above: flags, remote virtual machines (eventually), and other services. * Respect `b.N` in that users of the benchmark may want to "run for an hour" or something of the sort. -* Use the `b.ReportMetric` method to report custom metrics. +* Use the `b.ReportMetric()` method to report custom metrics. * Set the timer if time is useful for reporting. There isn't a way to turn off default metrics in testing.B (B/op, allocs/op, ns/op). * Take a look at dockerutil at //pkg/test/dockerutil to see all methods available from containers. The API is based on the "official" [docker API for golang](https://pkg.go.dev/mod/github.com/docker/docker). -* `harness.GetMachine` marks how many machines this tests needs. If you have a - client and server and to mark them as multiple machines, call it - `GetMachine` twice. +* `harness.GetMachine()` marks how many machines this tests needs. If you have + a client and server and to mark them as multiple machines, call + `harness.GetMachine()` twice. + +## Profiling + +For profiling, the runtime is required to have the `--profile` flag enabled. +This flag loosens seccomp filters so that the runtime can write profile data to +disk. This configuration is not recommended for production. + +* Install runsc with the `--profile` flag: `make configure RUNTIME=myrunsc + ARGS="--profile --platform=kvm --vfs2"`. The kvm and vfs2 flags are not + required, but are included for demonstration. +* Restart docker: `sudo service docker restart` + +To run and generate CPU profiles fs_test test run: + +``` +make sudo TARGETS=//test/benchmarks/fs:fs_test \ + ARGS="--runtime=myrunsc -test.v -test.bench=. --pprof-cpu" OPTIONS="-c opt" +``` + +Profiles would be at: `/tmp/profile/myrunsc/CONTAINERNAME/cpu.pprof` diff --git a/test/benchmarks/database/BUILD b/test/benchmarks/database/BUILD new file mode 100644 index 000000000..5e33465cd --- /dev/null +++ b/test/benchmarks/database/BUILD @@ -0,0 +1,28 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "database", + testonly = 1, + srcs = ["database.go"], + deps = ["//test/benchmarks/harness"], +) + +go_test( + name = "database_test", + size = "enormous", + srcs = [ + "redis_test.go", + ], + library = ":database", + tags = [ + # Requires docker and runsc to be configured before test runs. + "manual", + "local", + ], + deps = [ + "//pkg/test/dockerutil", + "//test/benchmarks/harness", + ], +) diff --git a/test/benchmarks/database/database.go b/test/benchmarks/database/database.go new file mode 100644 index 000000000..9eeb59f9a --- /dev/null +++ b/test/benchmarks/database/database.go @@ -0,0 +1,31 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package database holds benchmarks around database applications. +package database + +import ( + "os" + "testing" + + "gvisor.dev/gvisor/test/benchmarks/harness" +) + +var h harness.Harness + +// TestMain is the main method for package database. +func TestMain(m *testing.M) { + h.Init() + os.Exit(m.Run()) +} diff --git a/test/benchmarks/database/redis_test.go b/test/benchmarks/database/redis_test.go new file mode 100644 index 000000000..6d39f4d66 --- /dev/null +++ b/test/benchmarks/database/redis_test.go @@ -0,0 +1,197 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package database + +import ( + "context" + "fmt" + "regexp" + "strconv" + "strings" + "testing" + "time" + + "gvisor.dev/gvisor/pkg/test/dockerutil" + "gvisor.dev/gvisor/test/benchmarks/harness" +) + +// All possible operations from redis. Note: "ping" will +// run both PING_INLINE and PING_BUILD. +var operations []string = []string{ + "PING_INLINE", + "PING_BULK", + "SET", + "GET", + "INCR", + "LPUSH", + "RPUSH", + "LPOP", + "RPOP", + "SADD", + "HSET", + "SPOP", + "LRANGE_100", + "LRANGE_300", + "LRANGE_500", + "LRANGE_600", + "MSET", +} + +// BenchmarkRedis runs redis-benchmark against a redis instance and reports +// data in queries per second. Each is reported by named operation (e.g. LPUSH). +func BenchmarkRedis(b *testing.B) { + clientMachine, err := h.GetMachine() + if err != nil { + b.Fatalf("failed to get machine: %v", err) + } + defer clientMachine.CleanUp() + + serverMachine, err := h.GetMachine() + if err != nil { + b.Fatalf("failed to get machine: %v", err) + } + defer serverMachine.CleanUp() + + // Redis runs on port 6379 by default. + port := 6379 + ctx := context.Background() + + for _, operation := range operations { + b.Run(operation, func(b *testing.B) { + server := serverMachine.GetContainer(ctx, b) + defer server.CleanUp(ctx) + + // The redis docker container takes no arguments to run a redis server. + if err := server.Spawn(ctx, dockerutil.RunOpts{ + Image: "benchmarks/redis", + Ports: []int{port}, + }); err != nil { + b.Fatalf("failed to start redis server with: %v", err) + } + + if out, err := server.WaitForOutput(ctx, "Ready to accept connections", 3*time.Second); err != nil { + b.Fatalf("failed to start redis server: %v %s", err, out) + } + + ip, err := serverMachine.IPAddress() + if err != nil { + b.Fatal("failed to get IP from server: %v", err) + } + + serverPort, err := server.FindPort(ctx, port) + if err != nil { + b.Fatal("failed to get IP from server: %v", err) + } + + if err = harness.WaitUntilServing(ctx, clientMachine, ip, serverPort); err != nil { + b.Fatalf("failed to start redis with: %v", err) + } + + // runs redis benchmark -t operation for 100K requests against server. + cmd := strings.Split( + fmt.Sprintf("redis-benchmark --csv -t %s -h %s -p %d", operation, ip, serverPort), " ") + + // There is no -t PING_BULK for redis-benchmark, so adjust the command in that case. + // Note that "ping" will run both PING_INLINE and PING_BULK. + if operation == "PING_BULK" { + cmd = strings.Split( + fmt.Sprintf("redis-benchmark --csv -t ping -h %s -p %d", ip, serverPort), " ") + } + // Reset profiles and timer to begin the measurement. + server.RestartProfiles() + b.ResetTimer() + for i := 0; i < b.N; i++ { + client := clientMachine.GetNativeContainer(ctx, b) + defer client.CleanUp(ctx) + out, err := client.Run(ctx, dockerutil.RunOpts{ + Image: "benchmarks/redis", + }, cmd...) + if err != nil { + b.Fatalf("redis-benchmark failed with: %v", err) + } + + // Stop time while we parse results. + b.StopTimer() + result, err := parseOperation(operation, out) + if err != nil { + b.Fatalf("parsing result %s failed with err: %v", out, err) + } + b.ReportMetric(result, operation) // operations per second + b.StartTimer() + } + }) + } +} + +// parseOperation grabs the metric operations per second from redis-benchmark output. +func parseOperation(operation, data string) (float64, error) { + re := regexp.MustCompile(fmt.Sprintf(`"%s( .*)?","(\d*\.\d*)"`, operation)) + match := re.FindStringSubmatch(data) + // If no match, simply don't add it to the result map. + if len(match) < 3 { + return 0.0, fmt.Errorf("could not find %s in %s", operation, data) + } + return strconv.ParseFloat(match[2], 64) +} + +// TestParser tests the parser on sample data. +func TestParser(t *testing.T) { + sampleData := ` + "PING_INLINE","48661.80" + "PING_BULK","50301.81" + "SET","48923.68" + "GET","49382.71" + "INCR","49975.02" + "LPUSH","49875.31" + "RPUSH","50276.52" + "LPOP","50327.12" + "RPOP","50556.12" + "SADD","49504.95" + "HSET","49504.95" + "SPOP","50025.02" + "LPUSH (needed to benchmark LRANGE)","48875.86" + "LRANGE_100 (first 100 elements)","33955.86" + "LRANGE_300 (first 300 elements)","16550.81" + "LRANGE_500 (first 450 elements)","13653.74" + "LRANGE_600 (first 600 elements)","11219.57" + "MSET (10 keys)","44682.75" + ` + wants := map[string]float64{ + "PING_INLINE": 48661.80, + "PING_BULK": 50301.81, + "SET": 48923.68, + "GET": 49382.71, + "INCR": 49975.02, + "LPUSH": 49875.31, + "RPUSH": 50276.52, + "LPOP": 50327.12, + "RPOP": 50556.12, + "SADD": 49504.95, + "HSET": 49504.95, + "SPOP": 50025.02, + "LRANGE_100": 33955.86, + "LRANGE_300": 16550.81, + "LRANGE_500": 13653.74, + "LRANGE_600": 11219.57, + "MSET": 44682.75, + } + for op, want := range wants { + if got, err := parseOperation(op, sampleData); err != nil { + t.Fatalf("failed to parse %s: %v", op, err) + } else if want != got { + t.Fatalf("wanted %f for op %s, got %f", want, op, got) + } + } +} diff --git a/test/benchmarks/fs/bazel_test.go b/test/benchmarks/fs/bazel_test.go index fdcac1a7a..9b652fd43 100644 --- a/test/benchmarks/fs/bazel_test.go +++ b/test/benchmarks/fs/bazel_test.go @@ -15,6 +15,7 @@ package fs import ( "context" + "fmt" "strings" "testing" @@ -51,10 +52,10 @@ func BenchmarkABSL(b *testing.B) { workdir := "/abseil-cpp" - // Start a container. + // Start a container and sleep by an order of b.N. if err := container.Spawn(ctx, dockerutil.RunOpts{ Image: "benchmarks/absl", - }, "sleep", "1000"); err != nil { + }, "sleep", fmt.Sprintf("%d", 1000000)); err != nil { b.Fatalf("run failed with: %v", err) } @@ -67,15 +68,21 @@ func BenchmarkABSL(b *testing.B) { workdir = "/tmp" + workdir } - // Drop Caches. - if bm.clearCache { - if out, err := machine.RunCommand("/bin/sh -c sync; echo 3 > /proc/sys/vm/drop_caches"); err != nil { - b.Fatalf("failed to drop caches: %v %s", err, out) - } - } - + // Restart profiles after the copy. + container.RestartProfiles() b.ResetTimer() + // Drop Caches and bazel clean should happen inside the loop as we may use + // time options with b.N. (e.g. Run for an hour.) for i := 0; i < b.N; i++ { + b.StopTimer() + // Drop Caches for clear cache runs. + if bm.clearCache { + if out, err := machine.RunCommand("/bin/sh", "-c", "sync && sysctl vm.drop_caches=3"); err != nil { + b.Skipf("failed to drop caches: %v %s. You probably need root.", err, out) + } + } + b.StartTimer() + got, err := container.Exec(ctx, dockerutil.ExecOpts{ WorkDir: workdir, }, "bazel", "build", "-c", "opt", "absl/base/...") @@ -88,6 +95,13 @@ func BenchmarkABSL(b *testing.B) { if !strings.Contains(got, want) { b.Fatalf("string %s not in: %s", want, got) } + // Clean bazel in case we use b.N. + _, err = container.Exec(ctx, dockerutil.ExecOpts{ + WorkDir: workdir, + }, "bazel", "clean") + if err != nil { + b.Fatalf("build failed with: %v", err) + } b.StartTimer() } }) diff --git a/test/benchmarks/harness/machine.go b/test/benchmarks/harness/machine.go index 93c0db9ce..88e5e841b 100644 --- a/test/benchmarks/harness/machine.go +++ b/test/benchmarks/harness/machine.go @@ -25,9 +25,14 @@ import ( // Machine describes a real machine for use in benchmarks. type Machine interface { - // GetContainer gets a container from the machine, + // GetContainer gets a container from the machine. The container uses the + // runtime under test and is profiled if requested by flags. GetContainer(ctx context.Context, log testutil.Logger) *dockerutil.Container + // GetNativeContainer gets a native container from the machine. Native containers + // use runc by default and are not profiled. + GetNativeContainer(ctx context.Context, log testutil.Logger) *dockerutil.Container + // RunCommand runs cmd on this machine. RunCommand(cmd string, args ...string) (string, error) @@ -47,6 +52,11 @@ func (l *localMachine) GetContainer(ctx context.Context, logger testutil.Logger) return dockerutil.MakeContainer(ctx, logger) } +// GetContainer implements Machine.GetContainer for localMachine. +func (l *localMachine) GetNativeContainer(ctx context.Context, logger testutil.Logger) *dockerutil.Container { + return dockerutil.MakeNativeContainer(ctx, logger) +} + // RunCommand implements Machine.RunCommand for localMachine. func (l *localMachine) RunCommand(cmd string, args ...string) (string, error) { c := exec.Command(cmd, args...) diff --git a/test/benchmarks/harness/util.go b/test/benchmarks/harness/util.go index cc7de6426..bc551c582 100644 --- a/test/benchmarks/harness/util.go +++ b/test/benchmarks/harness/util.go @@ -27,12 +27,20 @@ import ( // IP:port. func WaitUntilServing(ctx context.Context, machine Machine, server net.IP, port int) error { var logger testutil.DefaultLogger = "netcat" - netcat := machine.GetContainer(ctx, logger) + netcat := machine.GetNativeContainer(ctx, logger) defer netcat.CleanUp(ctx) - cmd := fmt.Sprintf("while ! nc -zv %s %d; do true; done", server.String(), port) + cmd := fmt.Sprintf("while ! nc -zv %s %d; do true; done", server, port) _, err := netcat.Run(ctx, dockerutil.RunOpts{ Image: "packetdrill", }, "sh", "-c", cmd) return err } + +// DropCaches drops caches on the provided machine. Requires root. +func DropCaches(machine Machine) error { + if out, err := machine.RunCommand("/bin/sh", "-c", "sync | sysctl vm.drop_caches=3"); err != nil { + return fmt.Errorf("failed to drop caches: %v logs: %s", err, out) + } + return nil +} diff --git a/test/benchmarks/media/BUILD b/test/benchmarks/media/BUILD new file mode 100644 index 000000000..6c41fc4f6 --- /dev/null +++ b/test/benchmarks/media/BUILD @@ -0,0 +1,21 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "media", + testonly = 1, + srcs = ["media.go"], + deps = ["//test/benchmarks/harness"], +) + +go_test( + name = "media_test", + size = "large", + srcs = ["ffmpeg_test.go"], + library = ":media", + deps = [ + "//pkg/test/dockerutil", + "//test/benchmarks/harness", + ], +) diff --git a/test/benchmarks/media/ffmpeg_test.go b/test/benchmarks/media/ffmpeg_test.go new file mode 100644 index 000000000..bfcfbab80 --- /dev/null +++ b/test/benchmarks/media/ffmpeg_test.go @@ -0,0 +1,52 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package media + +import ( + "context" + "strings" + "testing" + + "gvisor.dev/gvisor/pkg/test/dockerutil" + "gvisor.dev/gvisor/test/benchmarks/harness" +) + +// BenchmarkFfmpeg runs ffmpeg in a container and records runtime. +// BenchmarkFfmpeg should run as root to drop caches. +func BenchmarkFfmpeg(b *testing.B) { + machine, err := h.GetMachine() + if err != nil { + b.Fatalf("failed to get machine: %v", err) + } + defer machine.CleanUp() + + ctx := context.Background() + container := machine.GetContainer(ctx, b) + cmd := strings.Split("ffmpeg -i video.mp4 -c:v libx264 -preset veryslow output.mp4", " ") + + b.ResetTimer() + for i := 0; i < b.N; i++ { + b.StopTimer() + if err := harness.DropCaches(machine); err != nil { + b.Skipf("failed to drop caches: %v. You probably need root.", err) + } + b.StartTimer() + + if _, err := container.Run(ctx, dockerutil.RunOpts{ + Image: "benchmarks/ffmpeg", + }, cmd...); err != nil { + b.Fatalf("failed to run container: %v", err) + } + } +} diff --git a/test/benchmarks/media/media.go b/test/benchmarks/media/media.go new file mode 100644 index 000000000..c7b35b758 --- /dev/null +++ b/test/benchmarks/media/media.go @@ -0,0 +1,31 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package media holds benchmarks around media processing applications. +package media + +import ( + "os" + "testing" + + "gvisor.dev/gvisor/test/benchmarks/harness" +) + +var h harness.Harness + +// TestMain is the main method for package media. +func TestMain(m *testing.M) { + h.Init() + os.Exit(m.Run()) +} diff --git a/test/benchmarks/network/BUILD b/test/benchmarks/network/BUILD index 16d267bc8..363041fb7 100644 --- a/test/benchmarks/network/BUILD +++ b/test/benchmarks/network/BUILD @@ -24,6 +24,7 @@ go_test( ], deps = [ "//pkg/test/dockerutil", + "//pkg/test/testutil", "//test/benchmarks/harness", ], ) diff --git a/test/benchmarks/network/httpd_test.go b/test/benchmarks/network/httpd_test.go index f9afdf15f..fe23ca949 100644 --- a/test/benchmarks/network/httpd_test.go +++ b/test/benchmarks/network/httpd_test.go @@ -52,12 +52,12 @@ func BenchmarkHttpdConcurrency(b *testing.B) { defer serverMachine.CleanUp() // The test iterates over client concurrency, so set other parameters. - requests := 1000 + requests := 10000 concurrency := []int{1, 5, 10, 25} doc := docs["10Kb"] for _, c := range concurrency { - b.Run(fmt.Sprintf("%dConcurrency", c), func(b *testing.B) { + b.Run(fmt.Sprintf("%d", c), func(b *testing.B) { runHttpd(b, clientMachine, serverMachine, doc, requests, c) }) } @@ -78,7 +78,7 @@ func BenchmarkHttpdDocSize(b *testing.B) { } defer serverMachine.CleanUp() - requests := 1000 + requests := 10000 concurrency := 1 for name, filename := range docs { @@ -129,7 +129,7 @@ func runHttpd(b *testing.B, clientMachine, serverMachine harness.Machine, doc st harness.WaitUntilServing(ctx, clientMachine, ip, servingPort) // Grab a client. - client := clientMachine.GetContainer(ctx, b) + client := clientMachine.GetNativeContainer(ctx, b) defer client.CleanUp(ctx) path := fmt.Sprintf("http://%s:%d/%s", ip, servingPort, doc) @@ -137,6 +137,7 @@ func runHttpd(b *testing.B, clientMachine, serverMachine harness.Machine, doc st cmd = fmt.Sprintf("ab -n %d -c %d %s", requests, concurrency, path) b.ResetTimer() + server.RestartProfiles() for i := 0; i < b.N; i++ { out, err := client.Run(ctx, dockerutil.RunOpts{ Image: "benchmarks/ab", diff --git a/test/benchmarks/network/iperf_test.go b/test/benchmarks/network/iperf_test.go index 664e0797e..a5e198e14 100644 --- a/test/benchmarks/network/iperf_test.go +++ b/test/benchmarks/network/iperf_test.go @@ -22,12 +22,13 @@ import ( "testing" "gvisor.dev/gvisor/pkg/test/dockerutil" + "gvisor.dev/gvisor/pkg/test/testutil" "gvisor.dev/gvisor/test/benchmarks/harness" ) func BenchmarkIperf(b *testing.B) { + const time = 10 // time in seconds to run the client. - // Get two machines clientMachine, err := h.GetMachine() if err != nil { b.Fatalf("failed to get machine: %v", err) @@ -39,30 +40,32 @@ func BenchmarkIperf(b *testing.B) { b.Fatalf("failed to get machine: %v", err) } defer serverMachine.CleanUp() - + ctx := context.Background() for _, bm := range []struct { - name string - clientRuntime string - serverRuntime string + name string + clientFunc func(context.Context, testutil.Logger) *dockerutil.Container + serverFunc func(context.Context, testutil.Logger) *dockerutil.Container }{ // We are either measuring the server or the client. The other should be // runc. e.g. Upload sees how fast the runtime under test uploads to a native // server. - {name: "Upload", clientRuntime: dockerutil.Runtime(), serverRuntime: "runc"}, - {name: "Download", clientRuntime: "runc", serverRuntime: dockerutil.Runtime()}, + { + name: "Upload", + clientFunc: clientMachine.GetContainer, + serverFunc: serverMachine.GetNativeContainer, + }, + { + name: "Download", + clientFunc: clientMachine.GetNativeContainer, + serverFunc: serverMachine.GetContainer, + }, } { b.Run(bm.name, func(b *testing.B) { - - // Get a container from the server and set its runtime. - ctx := context.Background() - server := serverMachine.GetContainer(ctx, b) + // Set up the containers. + server := bm.serverFunc(ctx, b) defer server.CleanUp(ctx) - server.Runtime = bm.serverRuntime - - // Get a container from the client and set its runtime. - client := clientMachine.GetContainer(ctx, b) + client := bm.clientFunc(ctx, b) defer client.CleanUp(ctx) - client.Runtime = bm.clientRuntime // iperf serves on port 5001 by default. port := 5001 @@ -91,11 +94,14 @@ func BenchmarkIperf(b *testing.B) { } // iperf report in Kb realtime - cmd := fmt.Sprintf("iperf -f K --realtime -c %s -p %d", ip.String(), servingPort) + cmd := fmt.Sprintf("iperf -f K --realtime --time %d -c %s -p %d", time, ip.String(), servingPort) // Run the client. b.ResetTimer() + // Restart the server profiles. If the server isn't being profiled + // this does nothing. + server.RestartProfiles() for i := 0; i < b.N; i++ { out, err := client.Run(ctx, dockerutil.RunOpts{ Image: "benchmarks/iperf", diff --git a/test/e2e/integration_test.go b/test/e2e/integration_test.go index 5a9455b33..ef42b689a 100644 --- a/test/e2e/integration_test.go +++ b/test/e2e/integration_test.go @@ -40,6 +40,9 @@ import ( "gvisor.dev/gvisor/pkg/test/testutil" ) +// defaultWait is the default wait time used for tests. +const defaultWait = time.Minute + // httpRequestSucceeds sends a request to a given url and checks that the status is OK. func httpRequestSucceeds(client http.Client, server string, port int) error { url := fmt.Sprintf("http://%s:%d", server, port) @@ -76,10 +79,10 @@ func TestLifeCycle(t *testing.T) { if err != nil { t.Fatalf("docker.FindPort(80) failed: %v", err) } - if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil { + if err := testutil.WaitForHTTP(port, defaultWait); err != nil { t.Fatalf("WaitForHTTP() timeout: %v", err) } - client := http.Client{Timeout: time.Duration(2 * time.Second)} + client := http.Client{Timeout: defaultWait} if err := httpRequestSucceeds(client, "localhost", port); err != nil { t.Errorf("http request failed: %v", err) } @@ -116,12 +119,12 @@ func TestPauseResume(t *testing.T) { } // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil { + if err := testutil.WaitForHTTP(port, defaultWait); err != nil { t.Fatalf("WaitForHTTP() timeout: %v", err) } // Check that container is working. - client := http.Client{Timeout: time.Duration(2 * time.Second)} + client := http.Client{Timeout: defaultWait} if err := httpRequestSucceeds(client, "localhost", port); err != nil { t.Error("http request failed:", err) } @@ -131,6 +134,7 @@ func TestPauseResume(t *testing.T) { } // Check if container is paused. + client = http.Client{Timeout: 10 * time.Millisecond} // Don't wait a minute. switch _, err := client.Get(fmt.Sprintf("http://localhost:%d", port)); v := err.(type) { case nil: t.Errorf("http req expected to fail but it succeeded") @@ -147,11 +151,12 @@ func TestPauseResume(t *testing.T) { } // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil { + if err := testutil.WaitForHTTP(port, defaultWait); err != nil { t.Fatalf("WaitForHTTP() timeout: %v", err) } // Check if container is working again. + client = http.Client{Timeout: defaultWait} if err := httpRequestSucceeds(client, "localhost", port); err != nil { t.Error("http request failed:", err) } @@ -178,12 +183,12 @@ func TestCheckpointRestore(t *testing.T) { if err := d.Checkpoint(ctx, "test"); err != nil { t.Fatalf("docker checkpoint failed: %v", err) } - if err := d.WaitTimeout(ctx, 30*time.Second); err != nil { + if err := d.WaitTimeout(ctx, defaultWait); err != nil { t.Fatalf("wait failed: %v", err) } // TODO(b/143498576): Remove Poll after github.com/moby/moby/issues/38963 is fixed. - if err := testutil.Poll(func() error { return d.Restore(ctx, "test") }, 15*time.Second); err != nil { + if err := testutil.Poll(func() error { return d.Restore(ctx, "test") }, defaultWait); err != nil { t.Fatalf("docker restore failed: %v", err) } @@ -194,12 +199,12 @@ func TestCheckpointRestore(t *testing.T) { } // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil { + if err := testutil.WaitForHTTP(port, defaultWait); err != nil { t.Fatalf("WaitForHTTP() timeout: %v", err) } // Check if container is working again. - client := http.Client{Timeout: time.Duration(2 * time.Second)} + client := http.Client{Timeout: defaultWait} if err := httpRequestSucceeds(client, "localhost", port); err != nil { t.Error("http request failed:", err) } @@ -236,7 +241,7 @@ func TestConnectToSelf(t *testing.T) { if want := "server\n"; reply != want { t.Errorf("Error on server, want: %q, got: %q", want, reply) } - if _, err := d.WaitForOutput(ctx, "^client\n$", 1*time.Second); err != nil { + if _, err := d.WaitForOutput(ctx, "^client\n$", defaultWait); err != nil { t.Fatalf("docker.WaitForOutput(client) timeout: %v", err) } } @@ -375,7 +380,7 @@ func TestTmpFile(t *testing.T) { d := dockerutil.MakeContainer(ctx, t) defer d.CleanUp(ctx) - opts := dockerutil.RunOpts{Image: "tmpfile"} + opts := dockerutil.RunOpts{Image: "basic/tmpfile"} got, err := d.Run(ctx, opts, "cat", "/tmp/foo/file.txt") if err != nil { t.Fatalf("docker run failed: %v", err) @@ -427,7 +432,7 @@ func TestHostOverlayfsCopyUp(t *testing.T) { defer d.CleanUp(ctx) if _, err := d.Run(ctx, dockerutil.RunOpts{ - Image: "hostoverlaytest", + Image: "basic/hostoverlaytest", WorkDir: "/root", }, "./test"); err != nil { t.Fatalf("docker run failed: %v", err) diff --git a/test/image/image_test.go b/test/image/image_test.go index 8aa78035f..ac6186688 100644 --- a/test/image/image_test.go +++ b/test/image/image_test.go @@ -37,6 +37,13 @@ import ( "gvisor.dev/gvisor/pkg/test/testutil" ) +// defaultWait defines how long to wait for progress. +// +// See BUILD: This is at least a "large" test, so allow up to 1 minute for any +// given "wait" step. Note that all tests are run in parallel, which may cause +// individual slow-downs (but a huge speed-up in aggregate). +const defaultWait = time.Minute + func TestHelloWorld(t *testing.T) { ctx := context.Background() d := dockerutil.MakeContainer(ctx, t) @@ -130,7 +137,7 @@ func TestHttpd(t *testing.T) { } // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil { + if err := testutil.WaitForHTTP(port, defaultWait); err != nil { t.Errorf("WaitForHTTP() timeout: %v", err) } @@ -159,7 +166,7 @@ func TestNginx(t *testing.T) { } // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil { + if err := testutil.WaitForHTTP(port, defaultWait); err != nil { t.Errorf("WaitForHTTP() timeout: %v", err) } @@ -180,7 +187,7 @@ func TestMysql(t *testing.T) { } // Wait until it's up and running. - if _, err := server.WaitForOutput(ctx, "port: 3306 MySQL Community Server", 3*time.Minute); err != nil { + if _, err := server.WaitForOutput(ctx, "port: 3306 MySQL Community Server", defaultWait); err != nil { t.Fatalf("WaitForOutput() timeout: %v", err) } @@ -200,7 +207,7 @@ func TestMysql(t *testing.T) { } // Ensure file executed to the end and shutdown mysql. - if _, err := server.WaitForOutput(ctx, "mysqld: Shutdown complete", 30*time.Second); err != nil { + if _, err := server.WaitForOutput(ctx, "mysqld: Shutdown complete", defaultWait); err != nil { t.Fatalf("WaitForOutput() timeout: %v", err) } } @@ -225,7 +232,7 @@ func TestTomcat(t *testing.T) { } // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil { + if err := testutil.WaitForHTTP(port, defaultWait); err != nil { t.Fatalf("WaitForHTTP() timeout: %v", err) } @@ -262,7 +269,7 @@ func TestRuby(t *testing.T) { } // Wait until it's up and running, 'gem install' can take some time. - if err := testutil.WaitForHTTP(port, 1*time.Minute); err != nil { + if err := testutil.WaitForHTTP(port, time.Minute); err != nil { t.Fatalf("WaitForHTTP() timeout: %v", err) } @@ -299,7 +306,7 @@ func TestStdio(t *testing.T) { } for _, want := range []string{wantStdout, wantStderr} { - if _, err := d.WaitForOutput(ctx, want, 5*time.Second); err != nil { + if _, err := d.WaitForOutput(ctx, want, defaultWait); err != nil { t.Fatalf("docker didn't get output %q : %v", want, err) } } diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go index 068f228bd..af4355ba8 100644 --- a/test/iptables/filter_input.go +++ b/test/iptables/filter_input.go @@ -81,7 +81,7 @@ func (FilterInputDropUDP) ContainerAction(ip net.IP) error { // LocalAction implements TestCase.LocalAction. func (FilterInputDropUDP) LocalAction(ip net.IP) error { - return sendUDPLoop(ip, dropPort, sendloopDuration) + return spawnUDPLoop(ip, dropPort, sendloopDuration) } // FilterInputDropOnlyUDP tests that "-p udp -j DROP" only affects UDP traffic. @@ -141,7 +141,7 @@ func (FilterInputDropUDPPort) ContainerAction(ip net.IP) error { // LocalAction implements TestCase.LocalAction. func (FilterInputDropUDPPort) LocalAction(ip net.IP) error { - return sendUDPLoop(ip, dropPort, sendloopDuration) + return spawnUDPLoop(ip, dropPort, sendloopDuration) } // FilterInputDropDifferentUDPPort tests that dropping traffic for a single UDP port @@ -169,7 +169,7 @@ func (FilterInputDropDifferentUDPPort) ContainerAction(ip net.IP) error { // LocalAction implements TestCase.LocalAction. func (FilterInputDropDifferentUDPPort) LocalAction(ip net.IP) error { - return sendUDPLoop(ip, acceptPort, sendloopDuration) + return spawnUDPLoop(ip, acceptPort, sendloopDuration) } // FilterInputDropTCPDestPort tests that connections are not accepted on specified source ports. @@ -269,7 +269,7 @@ func (FilterInputDropAll) ContainerAction(ip net.IP) error { // LocalAction implements TestCase.LocalAction. func (FilterInputDropAll) LocalAction(ip net.IP) error { - return sendUDPLoop(ip, dropPort, sendloopDuration) + return spawnUDPLoop(ip, dropPort, sendloopDuration) } // FilterInputMultiUDPRules verifies that multiple UDP rules are applied @@ -365,7 +365,7 @@ func (FilterInputDefaultPolicyAccept) ContainerAction(ip net.IP) error { // LocalAction implements TestCase.LocalAction. func (FilterInputDefaultPolicyAccept) LocalAction(ip net.IP) error { - return sendUDPLoop(ip, acceptPort, sendloopDuration) + return spawnUDPLoop(ip, acceptPort, sendloopDuration) } // FilterInputDefaultPolicyDrop tests the default DROP policy. @@ -396,7 +396,7 @@ func (FilterInputDefaultPolicyDrop) ContainerAction(ip net.IP) error { // LocalAction implements TestCase.LocalAction. func (FilterInputDefaultPolicyDrop) LocalAction(ip net.IP) error { - return sendUDPLoop(ip, acceptPort, sendloopDuration) + return spawnUDPLoop(ip, acceptPort, sendloopDuration) } // FilterInputReturnUnderflow tests that -j RETURN in a built-in chain causes @@ -428,7 +428,7 @@ func (FilterInputReturnUnderflow) ContainerAction(ip net.IP) error { // LocalAction implements TestCase.LocalAction. func (FilterInputReturnUnderflow) LocalAction(ip net.IP) error { - return sendUDPLoop(ip, acceptPort, sendloopDuration) + return spawnUDPLoop(ip, acceptPort, sendloopDuration) } // FilterInputSerializeJump verifies that we can serialize jumps. @@ -482,7 +482,7 @@ func (FilterInputJumpBasic) ContainerAction(ip net.IP) error { // LocalAction implements TestCase.LocalAction. func (FilterInputJumpBasic) LocalAction(ip net.IP) error { - return sendUDPLoop(ip, acceptPort, sendloopDuration) + return spawnUDPLoop(ip, acceptPort, sendloopDuration) } // FilterInputJumpReturn jumps, returns, and executes a rule. @@ -512,7 +512,7 @@ func (FilterInputJumpReturn) ContainerAction(ip net.IP) error { // LocalAction implements TestCase.LocalAction. func (FilterInputJumpReturn) LocalAction(ip net.IP) error { - return sendUDPLoop(ip, acceptPort, sendloopDuration) + return spawnUDPLoop(ip, acceptPort, sendloopDuration) } // FilterInputJumpReturnDrop jumps to a chain, returns, and DROPs packets. @@ -549,7 +549,7 @@ func (FilterInputJumpReturnDrop) ContainerAction(ip net.IP) error { // LocalAction implements TestCase.LocalAction. func (FilterInputJumpReturnDrop) LocalAction(ip net.IP) error { - return sendUDPLoop(ip, dropPort, sendloopDuration) + return spawnUDPLoop(ip, dropPort, sendloopDuration) } // FilterInputJumpBuiltin verifies that jumping to a top-levl chain is illegal. @@ -604,7 +604,7 @@ func (FilterInputJumpTwice) ContainerAction(ip net.IP) error { // LocalAction implements TestCase.LocalAction. func (FilterInputJumpTwice) LocalAction(ip net.IP) error { - return sendUDPLoop(ip, acceptPort, sendloopDuration) + return spawnUDPLoop(ip, acceptPort, sendloopDuration) } // FilterInputDestination verifies that we can filter packets via `-d @@ -638,7 +638,7 @@ func (FilterInputDestination) ContainerAction(ip net.IP) error { // LocalAction implements TestCase.LocalAction. func (FilterInputDestination) LocalAction(ip net.IP) error { - return sendUDPLoop(ip, acceptPort, sendloopDuration) + return spawnUDPLoop(ip, acceptPort, sendloopDuration) } // FilterInputInvertDestination verifies that we can filter packets via `! -d @@ -667,7 +667,7 @@ func (FilterInputInvertDestination) ContainerAction(ip net.IP) error { // LocalAction implements TestCase.LocalAction. func (FilterInputInvertDestination) LocalAction(ip net.IP) error { - return sendUDPLoop(ip, acceptPort, sendloopDuration) + return spawnUDPLoop(ip, acceptPort, sendloopDuration) } // FilterInputSource verifies that we can filter packets via `-s @@ -696,7 +696,7 @@ func (FilterInputSource) ContainerAction(ip net.IP) error { // LocalAction implements TestCase.LocalAction. func (FilterInputSource) LocalAction(ip net.IP) error { - return sendUDPLoop(ip, acceptPort, sendloopDuration) + return spawnUDPLoop(ip, acceptPort, sendloopDuration) } // FilterInputInvertSource verifies that we can filter packets via `! -s @@ -725,5 +725,5 @@ func (FilterInputInvertSource) ContainerAction(ip net.IP) error { // LocalAction implements TestCase.LocalAction. func (FilterInputInvertSource) LocalAction(ip net.IP) error { - return sendUDPLoop(ip, acceptPort, sendloopDuration) + return spawnUDPLoop(ip, acceptPort, sendloopDuration) } diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go index f5ac79370..f303030aa 100644 --- a/test/iptables/iptables_test.go +++ b/test/iptables/iptables_test.go @@ -263,6 +263,13 @@ func TestNATPreRedirectTCPPort(t *testing.T) { singleTest(t, NATPreRedirectTCPPort{}) } +func TestNATPreRedirectTCPOutgoing(t *testing.T) { + singleTest(t, NATPreRedirectTCPOutgoing{}) +} + +func TestNATOutRedirectTCPIncoming(t *testing.T) { + singleTest(t, NATOutRedirectTCPIncoming{}) +} func TestNATOutRedirectUDPPort(t *testing.T) { singleTest(t, NATOutRedirectUDPPort{}) } diff --git a/test/iptables/iptables_util.go b/test/iptables/iptables_util.go index d4bc55b24..174694002 100644 --- a/test/iptables/iptables_util.go +++ b/test/iptables/iptables_util.go @@ -84,17 +84,42 @@ func listenUDP(port int, timeout time.Duration) error { // sendUDPLoop sends 1 byte UDP packets repeatedly to the IP and port specified // over a duration. func sendUDPLoop(ip net.IP, port int, duration time.Duration) error { - // Send packets for a few seconds. + conn, err := connectUDP(ip, port) + if err != nil { + return err + } + defer conn.Close() + loopUDP(conn, duration) + return nil +} + +// spawnUDPLoop works like sendUDPLoop, but returns immediately and sends +// packets in another goroutine. +func spawnUDPLoop(ip net.IP, port int, duration time.Duration) error { + conn, err := connectUDP(ip, port) + if err != nil { + return err + } + go func() { + defer conn.Close() + loopUDP(conn, duration) + }() + return nil +} + +func connectUDP(ip net.IP, port int) (net.Conn, error) { remote := net.UDPAddr{ IP: ip, Port: port, } conn, err := net.DialUDP(network, nil, &remote) if err != nil { - return err + return nil, err } - defer conn.Close() + return conn, nil +} +func loopUDP(conn net.Conn, duration time.Duration) { to := time.After(duration) for timedOut := false; !timedOut; { // This may return an error (connection refused) if the remote @@ -109,8 +134,6 @@ func sendUDPLoop(ip net.IP, port int, duration time.Duration) error { time.Sleep(200 * time.Millisecond) } } - - return nil } // listenTCP listens for connections on a TCP port. diff --git a/test/iptables/nat.go b/test/iptables/nat.go index 8562b0820..23288577d 100644 --- a/test/iptables/nat.go +++ b/test/iptables/nat.go @@ -28,6 +28,8 @@ const ( func init() { RegisterTestCase(NATPreRedirectUDPPort{}) RegisterTestCase(NATPreRedirectTCPPort{}) + RegisterTestCase(NATPreRedirectTCPOutgoing{}) + RegisterTestCase(NATOutRedirectTCPIncoming{}) RegisterTestCase(NATOutRedirectUDPPort{}) RegisterTestCase(NATOutRedirectTCPPort{}) RegisterTestCase(NATDropUDP{}) @@ -65,7 +67,7 @@ func (NATPreRedirectUDPPort) ContainerAction(ip net.IP) error { // LocalAction implements TestCase.LocalAction. func (NATPreRedirectUDPPort) LocalAction(ip net.IP) error { - return sendUDPLoop(ip, acceptPort, sendloopDuration) + return spawnUDPLoop(ip, acceptPort, sendloopDuration) } // NATPreRedirectTCPPort tests that connections are redirected on specified ports. @@ -91,6 +93,56 @@ func (NATPreRedirectTCPPort) LocalAction(ip net.IP) error { return connectTCP(ip, dropPort, sendloopDuration) } +// NATPreRedirectTCPOutgoing verifies that outgoing TCP connections aren't +// affected by PREROUTING connection tracking. +type NATPreRedirectTCPOutgoing struct{} + +// Name implements TestCase.Name. +func (NATPreRedirectTCPOutgoing) Name() string { + return "NATPreRedirectTCPOutgoing" +} + +// ContainerAction implements TestCase.ContainerAction. +func (NATPreRedirectTCPOutgoing) ContainerAction(ip net.IP) error { + // Redirect all incoming TCP traffic to a closed port. + if err := natTable("-A", "PREROUTING", "-p", "tcp", "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", dropPort)); err != nil { + return err + } + + // Establish a connection to the host process. + return connectTCP(ip, acceptPort, sendloopDuration) +} + +// LocalAction implements TestCase.LocalAction. +func (NATPreRedirectTCPOutgoing) LocalAction(ip net.IP) error { + return listenTCP(acceptPort, sendloopDuration) +} + +// NATOutRedirectTCPIncoming verifies that incoming TCP connections aren't +// affected by OUTPUT connection tracking. +type NATOutRedirectTCPIncoming struct{} + +// Name implements TestCase.Name. +func (NATOutRedirectTCPIncoming) Name() string { + return "NATOutRedirectTCPIncoming" +} + +// ContainerAction implements TestCase.ContainerAction. +func (NATOutRedirectTCPIncoming) ContainerAction(ip net.IP) error { + // Redirect all outgoing TCP traffic to a closed port. + if err := natTable("-A", "OUTPUT", "-p", "tcp", "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", dropPort)); err != nil { + return err + } + + // Establish a connection to the host process. + return listenTCP(acceptPort, sendloopDuration) +} + +// LocalAction implements TestCase.LocalAction. +func (NATOutRedirectTCPIncoming) LocalAction(ip net.IP) error { + return connectTCP(ip, acceptPort, sendloopDuration) +} + // NATOutRedirectUDPPort tests that packets are redirected to different port. type NATOutRedirectUDPPort struct{} @@ -135,7 +187,7 @@ func (NATDropUDP) ContainerAction(ip net.IP) error { // LocalAction implements TestCase.LocalAction. func (NATDropUDP) LocalAction(ip net.IP) error { - return sendUDPLoop(ip, acceptPort, sendloopDuration) + return spawnUDPLoop(ip, acceptPort, sendloopDuration) } // NATAcceptAll tests that all UDP packets are accepted. @@ -161,7 +213,7 @@ func (NATAcceptAll) ContainerAction(ip net.IP) error { // LocalAction implements TestCase.LocalAction. func (NATAcceptAll) LocalAction(ip net.IP) error { - return sendUDPLoop(ip, acceptPort, sendloopDuration) + return spawnUDPLoop(ip, acceptPort, sendloopDuration) } // NATOutRedirectIP uses iptables to select packets based on destination IP and @@ -258,7 +310,7 @@ func (NATPreRedirectIP) ContainerAction(ip net.IP) error { // LocalAction implements TestCase.LocalAction. func (NATPreRedirectIP) LocalAction(ip net.IP) error { - return sendUDPLoop(ip, dropPort, sendloopDuration) + return spawnUDPLoop(ip, dropPort, sendloopDuration) } // NATPreDontRedirectIP tests that iptables matching with "-d" does not match @@ -280,7 +332,7 @@ func (NATPreDontRedirectIP) ContainerAction(ip net.IP) error { // LocalAction implements TestCase.LocalAction. func (NATPreDontRedirectIP) LocalAction(ip net.IP) error { - return sendUDPLoop(ip, acceptPort, sendloopDuration) + return spawnUDPLoop(ip, acceptPort, sendloopDuration) } // NATPreRedirectInvert tests that iptables can match with "! -d". @@ -301,7 +353,7 @@ func (NATPreRedirectInvert) ContainerAction(ip net.IP) error { // LocalAction implements TestCase.LocalAction. func (NATPreRedirectInvert) LocalAction(ip net.IP) error { - return sendUDPLoop(ip, dropPort, sendloopDuration) + return spawnUDPLoop(ip, dropPort, sendloopDuration) } // NATRedirectRequiresProtocol tests that use of the --to-ports flag requires a diff --git a/test/packetdrill/defs.bzl b/test/packetdrill/defs.bzl index f499c177b..fc28ce9ba 100644 --- a/test/packetdrill/defs.bzl +++ b/test/packetdrill/defs.bzl @@ -26,7 +26,7 @@ def _packetdrill_test_impl(ctx): transitive_files = depset() if hasattr(ctx.attr._test_runner, "data_runfiles"): - transitive_files = depset(ctx.attr._test_runner.data_runfiles.files) + transitive_files = ctx.attr._test_runner.data_runfiles.files runfiles = ctx.runfiles( files = [test_runner] + ctx.files._init_script + ctx.files.scripts, transitive_files = transitive_files, @@ -60,11 +60,15 @@ _packetdrill_test = rule( implementation = _packetdrill_test_impl, ) -_PACKETDRILL_TAGS = ["local", "manual"] +PACKETDRILL_TAGS = [ + "local", + "manual", + "packetdrill", +] def packetdrill_linux_test(name, **kwargs): if "tags" not in kwargs: - kwargs["tags"] = _PACKETDRILL_TAGS + kwargs["tags"] = PACKETDRILL_TAGS _packetdrill_test( name = name, flags = ["--dut_platform", "linux"], @@ -73,7 +77,7 @@ def packetdrill_linux_test(name, **kwargs): def packetdrill_netstack_test(name, **kwargs): if "tags" not in kwargs: - kwargs["tags"] = _PACKETDRILL_TAGS + kwargs["tags"] = PACKETDRILL_TAGS _packetdrill_test( name = name, # This is the default runtime unless diff --git a/test/packetimpact/runner/defs.bzl b/test/packetimpact/runner/defs.bzl index 77cdfea12..79b3c9162 100644 --- a/test/packetimpact/runner/defs.bzl +++ b/test/packetimpact/runner/defs.bzl @@ -55,7 +55,11 @@ _packetimpact_test = rule( implementation = _packetimpact_test_impl, ) -PACKETIMPACT_TAGS = ["local", "manual"] +PACKETIMPACT_TAGS = [ + "local", + "manual", + "packetimpact", +] def packetimpact_linux_test( name, @@ -75,7 +79,7 @@ def packetimpact_linux_test( name = name + "_linux_test", testbench_binary = testbench_binary, flags = ["--dut_platform", "linux"] + expect_failure_flag, - tags = PACKETIMPACT_TAGS + ["packetimpact"], + tags = PACKETIMPACT_TAGS, **kwargs ) @@ -101,7 +105,7 @@ def packetimpact_netstack_test( # This is the default runtime unless # "--test_arg=--runtime=OTHER_RUNTIME" is used to override the value. flags = ["--dut_platform", "netstack", "--runtime=runsc-d"] + expect_failure_flag, - tags = PACKETIMPACT_TAGS + ["packetimpact"], + tags = PACKETIMPACT_TAGS, **kwargs ) @@ -121,7 +125,10 @@ def packetimpact_go_test(name, size = "small", pure = True, expect_linux_failure name = testbench_binary, size = size, pure = pure, - tags = PACKETIMPACT_TAGS, + tags = [ + "local", + "manual", + ], **kwargs ) packetimpact_linux_test( diff --git a/test/packetimpact/runner/packetimpact_test.go b/test/packetimpact/runner/packetimpact_test.go index ff5f5c7f1..74e1e6def 100644 --- a/test/packetimpact/runner/packetimpact_test.go +++ b/test/packetimpact/runner/packetimpact_test.go @@ -142,7 +142,7 @@ func TestOne(t *testing.T) { // Create the Docker container for the DUT. dut := dockerutil.MakeContainer(ctx, logger("dut")) if *dutPlatform == "linux" { - dut.Runtime = "" + dut = dockerutil.MakeNativeContainer(ctx, logger("dut")) } runOpts := dockerutil.RunOpts{ @@ -208,8 +208,7 @@ func TestOne(t *testing.T) { } // Create the Docker container for the testbench. - testbench := dockerutil.MakeContainer(ctx, logger("testbench")) - testbench.Runtime = "" // The testbench always runs on Linux. + testbench := dockerutil.MakeNativeContainer(ctx, logger("testbench")) tbb := path.Base(*testbenchBinary) containerTestbenchBinary := "/packetimpact/" + tbb @@ -280,11 +279,13 @@ func TestOne(t *testing.T) { } // Because the Linux kernel receives the SYN-ACK but didn't send the SYN it - // will issue a RST. To prevent this IPtables can be used to filter out all + // will issue an RST. To prevent this IPtables can be used to filter out all // incoming packets. The raw socket that packetimpact tests use will still see // everything. - if logs, err := testbench.Exec(ctx, dockerutil.ExecOpts{}, "iptables", "-A", "INPUT", "-i", testNetDev, "-j", "DROP"); err != nil { - t.Fatalf("unable to Exec iptables on container %s: %s, logs from testbench:\n%s", testbench.Name, err, logs) + for _, bin := range []string{"iptables", "ip6tables"} { + if logs, err := testbench.Exec(ctx, dockerutil.ExecOpts{}, bin, "-A", "INPUT", "-i", testNetDev, "-p", "tcp", "-j", "DROP"); err != nil { + t.Fatalf("unable to Exec %s on container %s: %s, logs from testbench:\n%s", bin, testbench.Name, err, logs) + } } // FIXME(b/156449515): Some piece of the system has a race. The old diff --git a/test/packetimpact/testbench/connections.go b/test/packetimpact/testbench/connections.go index 5d9cec73e..3af5f83fd 100644 --- a/test/packetimpact/testbench/connections.go +++ b/test/packetimpact/testbench/connections.go @@ -41,7 +41,8 @@ func portFromSockaddr(sa unix.Sockaddr) (uint16, error) { return 0, fmt.Errorf("sockaddr type %T does not contain port", sa) } -// pickPort makes a new socket and returns the socket FD and port. The domain should be AF_INET or AF_INET6. The caller must close the FD when done with +// pickPort makes a new socket and returns the socket FD and port. The domain +// should be AF_INET or AF_INET6. The caller must close the FD when done with // the port if there is no error. func pickPort(domain, typ int) (fd int, port uint16, err error) { fd, err = unix.Socket(domain, typ, 0) @@ -428,7 +429,6 @@ type Connection struct { layerStates []layerState injector Injector sniffer Sniffer - t *testing.T } // Returns the default incoming frame against which to match. If received is @@ -461,7 +461,9 @@ func (conn *Connection) match(override, received Layers) bool { } // Close frees associated resources held by the Connection. -func (conn *Connection) Close() { +func (conn *Connection) Close(t *testing.T) { + t.Helper() + errs := multierr.Combine(conn.sniffer.close(), conn.injector.close()) for _, s := range conn.layerStates { if err := s.close(); err != nil { @@ -469,7 +471,7 @@ func (conn *Connection) Close() { } } if errs != nil { - conn.t.Fatalf("unable to close %+v: %s", conn, errs) + t.Fatalf("unable to close %+v: %s", conn, errs) } } @@ -481,7 +483,9 @@ func (conn *Connection) Close() { // overriden first. As an example, valid values of overrideLayers for a TCP- // over-IPv4-over-Ethernet connection are: nil, [TCP], [IPv4, TCP], and // [Ethernet, IPv4, TCP]. -func (conn *Connection) CreateFrame(overrideLayers Layers, additionalLayers ...Layer) Layers { +func (conn *Connection) CreateFrame(t *testing.T, overrideLayers Layers, additionalLayers ...Layer) Layers { + t.Helper() + var layersToSend Layers for i, s := range conn.layerStates { layer := s.outgoing() @@ -490,7 +494,7 @@ func (conn *Connection) CreateFrame(overrideLayers Layers, additionalLayers ...L // end. if j := len(overrideLayers) - (len(conn.layerStates) - i); j >= 0 { if err := layer.merge(overrideLayers[j]); err != nil { - conn.t.Fatalf("can't merge %+v into %+v: %s", layer, overrideLayers[j], err) + t.Fatalf("can't merge %+v into %+v: %s", layer, overrideLayers[j], err) } } layersToSend = append(layersToSend, layer) @@ -504,21 +508,25 @@ func (conn *Connection) CreateFrame(overrideLayers Layers, additionalLayers ...L // This method is useful for sending out-of-band control messages such as // ICMP packets, where it would not make sense to update the transport layer's // state using the ICMP header. -func (conn *Connection) SendFrameStateless(frame Layers) { +func (conn *Connection) SendFrameStateless(t *testing.T, frame Layers) { + t.Helper() + outBytes, err := frame.ToBytes() if err != nil { - conn.t.Fatalf("can't build outgoing packet: %s", err) + t.Fatalf("can't build outgoing packet: %s", err) } - conn.injector.Send(outBytes) + conn.injector.Send(t, outBytes) } // SendFrame sends a frame on the wire and updates the state of all layers. -func (conn *Connection) SendFrame(frame Layers) { +func (conn *Connection) SendFrame(t *testing.T, frame Layers) { + t.Helper() + outBytes, err := frame.ToBytes() if err != nil { - conn.t.Fatalf("can't build outgoing packet: %s", err) + t.Fatalf("can't build outgoing packet: %s", err) } - conn.injector.Send(outBytes) + conn.injector.Send(t, outBytes) // frame might have nil values where the caller wanted to use default values. // sentFrame will have no nil values in it because it comes from parsing the @@ -527,7 +535,7 @@ func (conn *Connection) SendFrame(frame Layers) { // Update the state of each layer based on what was sent. for i, s := range conn.layerStates { if err := s.sent(sentFrame[i]); err != nil { - conn.t.Fatalf("Unable to update the state of %+v with %s: %s", s, sentFrame[i], err) + t.Fatalf("Unable to update the state of %+v with %s: %s", s, sentFrame[i], err) } } } @@ -537,18 +545,22 @@ func (conn *Connection) SendFrame(frame Layers) { // // Types defined with Connection as the underlying type should expose // type-safe versions of this method. -func (conn *Connection) send(overrideLayers Layers, additionalLayers ...Layer) { - conn.SendFrame(conn.CreateFrame(overrideLayers, additionalLayers...)) +func (conn *Connection) send(t *testing.T, overrideLayers Layers, additionalLayers ...Layer) { + t.Helper() + + conn.SendFrame(t, conn.CreateFrame(t, overrideLayers, additionalLayers...)) } // recvFrame gets the next successfully parsed frame (of type Layers) within the // timeout provided. If no parsable frame arrives before the timeout, it returns // nil. -func (conn *Connection) recvFrame(timeout time.Duration) Layers { +func (conn *Connection) recvFrame(t *testing.T, timeout time.Duration) Layers { + t.Helper() + if timeout <= 0 { return nil } - b := conn.sniffer.Recv(timeout) + b := conn.sniffer.Recv(t, timeout) if b == nil { return nil } @@ -568,32 +580,36 @@ func (e *layersError) Error() string { // Expect expects a frame with the final layerStates layer matching the // provided Layer within the timeout specified. If it doesn't arrive in time, // an error is returned. -func (conn *Connection) Expect(layer Layer, timeout time.Duration) (Layer, error) { +func (conn *Connection) Expect(t *testing.T, layer Layer, timeout time.Duration) (Layer, error) { + t.Helper() + // Make a frame that will ignore all but the final layer. layers := make([]Layer, len(conn.layerStates)) layers[len(layers)-1] = layer - gotFrame, err := conn.ExpectFrame(layers, timeout) + gotFrame, err := conn.ExpectFrame(t, layers, timeout) if err != nil { return nil, err } if len(conn.layerStates)-1 < len(gotFrame) { return gotFrame[len(conn.layerStates)-1], nil } - conn.t.Fatal("the received frame should be at least as long as the expected layers") + t.Fatalf("the received frame should be at least as long as the expected layers, got %d layers, want at least %d layers, got frame: %#v", len(gotFrame), len(conn.layerStates), gotFrame) panic("unreachable") } // ExpectFrame expects a frame that matches the provided Layers within the // timeout specified. If one arrives in time, the Layers is returned without an // error. If it doesn't arrive in time, it returns nil and error is non-nil. -func (conn *Connection) ExpectFrame(layers Layers, timeout time.Duration) (Layers, error) { +func (conn *Connection) ExpectFrame(t *testing.T, layers Layers, timeout time.Duration) (Layers, error) { + t.Helper() + deadline := time.Now().Add(timeout) var errs error for { var gotLayers Layers if timeout = time.Until(deadline); timeout > 0 { - gotLayers = conn.recvFrame(timeout) + gotLayers = conn.recvFrame(t, timeout) } if gotLayers == nil { if errs == nil { @@ -604,7 +620,7 @@ func (conn *Connection) ExpectFrame(layers Layers, timeout time.Duration) (Layer if conn.match(layers, gotLayers) { for i, s := range conn.layerStates { if err := s.received(gotLayers[i]); err != nil { - conn.t.Fatal(err) + t.Fatalf("failed to update test connection's layer states based on received frame: %s", err) } } return gotLayers, nil @@ -615,8 +631,10 @@ func (conn *Connection) ExpectFrame(layers Layers, timeout time.Duration) (Layer // Drain drains the sniffer's receive buffer by receiving packets until there's // nothing else to receive. -func (conn *Connection) Drain() { - conn.sniffer.Drain() +func (conn *Connection) Drain(t *testing.T) { + t.Helper() + + conn.sniffer.Drain(t) } // TCPIPv4 maintains the state for all the layers in a TCP/IPv4 connection. @@ -624,6 +642,8 @@ type TCPIPv4 Connection // NewTCPIPv4 creates a new TCPIPv4 connection with reasonable defaults. func NewTCPIPv4(t *testing.T, outgoingTCP, incomingTCP TCP) TCPIPv4 { + t.Helper() + etherState, err := newEtherState(Ether{}, Ether{}) if err != nil { t.Fatalf("can't make etherState: %s", err) @@ -649,57 +669,58 @@ func NewTCPIPv4(t *testing.T, outgoingTCP, incomingTCP TCP) TCPIPv4 { layerStates: []layerState{etherState, ipv4State, tcpState}, injector: injector, sniffer: sniffer, - t: t, } } // Connect performs a TCP 3-way handshake. The input Connection should have a // final TCP Layer. -func (conn *TCPIPv4) Connect() { - conn.t.Helper() +func (conn *TCPIPv4) Connect(t *testing.T) { + t.Helper() // Send the SYN. - conn.Send(TCP{Flags: Uint8(header.TCPFlagSyn)}) + conn.Send(t, TCP{Flags: Uint8(header.TCPFlagSyn)}) // Wait for the SYN-ACK. - synAck, err := conn.Expect(TCP{Flags: Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, time.Second) + synAck, err := conn.Expect(t, TCP{Flags: Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, time.Second) if err != nil { - conn.t.Fatalf("didn't get synack during handshake: %s", err) + t.Fatalf("didn't get synack during handshake: %s", err) } conn.layerStates[len(conn.layerStates)-1].(*tcpState).synAck = synAck // Send an ACK. - conn.Send(TCP{Flags: Uint8(header.TCPFlagAck)}) + conn.Send(t, TCP{Flags: Uint8(header.TCPFlagAck)}) } // ConnectWithOptions performs a TCP 3-way handshake with given TCP options. // The input Connection should have a final TCP Layer. -func (conn *TCPIPv4) ConnectWithOptions(options []byte) { - conn.t.Helper() +func (conn *TCPIPv4) ConnectWithOptions(t *testing.T, options []byte) { + t.Helper() // Send the SYN. - conn.Send(TCP{Flags: Uint8(header.TCPFlagSyn), Options: options}) + conn.Send(t, TCP{Flags: Uint8(header.TCPFlagSyn), Options: options}) // Wait for the SYN-ACK. - synAck, err := conn.Expect(TCP{Flags: Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, time.Second) + synAck, err := conn.Expect(t, TCP{Flags: Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, time.Second) if err != nil { - conn.t.Fatalf("didn't get synack during handshake: %s", err) + t.Fatalf("didn't get synack during handshake: %s", err) } conn.layerStates[len(conn.layerStates)-1].(*tcpState).synAck = synAck // Send an ACK. - conn.Send(TCP{Flags: Uint8(header.TCPFlagAck)}) + conn.Send(t, TCP{Flags: Uint8(header.TCPFlagAck)}) } // ExpectData is a convenient method that expects a Layer and the Layer after // it. If it doens't arrive in time, it returns nil. -func (conn *TCPIPv4) ExpectData(tcp *TCP, payload *Payload, timeout time.Duration) (Layers, error) { +func (conn *TCPIPv4) ExpectData(t *testing.T, tcp *TCP, payload *Payload, timeout time.Duration) (Layers, error) { + t.Helper() + expected := make([]Layer, len(conn.layerStates)) expected[len(expected)-1] = tcp if payload != nil { expected = append(expected, payload) } - return (*Connection)(conn).ExpectFrame(expected, timeout) + return (*Connection)(conn).ExpectFrame(t, expected, timeout) } // ExpectNextData attempts to receive the next incoming segment for the @@ -708,9 +729,11 @@ func (conn *TCPIPv4) ExpectData(tcp *TCP, payload *Payload, timeout time.Duratio // It differs from ExpectData() in that here we are only interested in the next // received segment, while ExpectData() can receive multiple segments for the // connection until there is a match with given layers or a timeout. -func (conn *TCPIPv4) ExpectNextData(tcp *TCP, payload *Payload, timeout time.Duration) (Layers, error) { +func (conn *TCPIPv4) ExpectNextData(t *testing.T, tcp *TCP, payload *Payload, timeout time.Duration) (Layers, error) { + t.Helper() + // Receive the first incoming TCP segment for this connection. - got, err := conn.ExpectData(&TCP{}, nil, timeout) + got, err := conn.ExpectData(t, &TCP{}, nil, timeout) if err != nil { return nil, err } @@ -719,7 +742,7 @@ func (conn *TCPIPv4) ExpectNextData(tcp *TCP, payload *Payload, timeout time.Dur expected[len(expected)-1] = tcp if payload != nil { expected = append(expected, payload) - tcp.SeqNum = Uint32(uint32(*conn.RemoteSeqNum()) - uint32(payload.Length())) + tcp.SeqNum = Uint32(uint32(*conn.RemoteSeqNum(t)) - uint32(payload.Length())) } if !(*Connection)(conn).match(expected, got) { return nil, fmt.Errorf("next frame is not matching %s during %s: got %s", expected, timeout, got) @@ -729,71 +752,91 @@ func (conn *TCPIPv4) ExpectNextData(tcp *TCP, payload *Payload, timeout time.Dur // Send a packet with reasonable defaults. Potentially override the TCP layer in // the connection with the provided layer and add additionLayers. -func (conn *TCPIPv4) Send(tcp TCP, additionalLayers ...Layer) { - (*Connection)(conn).send(Layers{&tcp}, additionalLayers...) +func (conn *TCPIPv4) Send(t *testing.T, tcp TCP, additionalLayers ...Layer) { + t.Helper() + + (*Connection)(conn).send(t, Layers{&tcp}, additionalLayers...) } // Close frees associated resources held by the TCPIPv4 connection. -func (conn *TCPIPv4) Close() { - (*Connection)(conn).Close() +func (conn *TCPIPv4) Close(t *testing.T) { + t.Helper() + + (*Connection)(conn).Close(t) } // Expect expects a frame with the TCP layer matching the provided TCP within // the timeout specified. If it doesn't arrive in time, an error is returned. -func (conn *TCPIPv4) Expect(tcp TCP, timeout time.Duration) (*TCP, error) { - layer, err := (*Connection)(conn).Expect(&tcp, timeout) +func (conn *TCPIPv4) Expect(t *testing.T, tcp TCP, timeout time.Duration) (*TCP, error) { + t.Helper() + + layer, err := (*Connection)(conn).Expect(t, &tcp, timeout) if layer == nil { return nil, err } gotTCP, ok := layer.(*TCP) if !ok { - conn.t.Fatalf("expected %s to be TCP", layer) + t.Fatalf("expected %s to be TCP", layer) } return gotTCP, err } -func (conn *TCPIPv4) tcpState() *tcpState { +func (conn *TCPIPv4) tcpState(t *testing.T) *tcpState { + t.Helper() + state, ok := conn.layerStates[2].(*tcpState) if !ok { - conn.t.Fatalf("got transport-layer state type=%T, expected tcpState", conn.layerStates[2]) + t.Fatalf("got transport-layer state type=%T, expected tcpState", conn.layerStates[2]) } return state } -func (conn *TCPIPv4) ipv4State() *ipv4State { +func (conn *TCPIPv4) ipv4State(t *testing.T) *ipv4State { + t.Helper() + state, ok := conn.layerStates[1].(*ipv4State) if !ok { - conn.t.Fatalf("expected network-layer state type=%T, expected ipv4State", conn.layerStates[1]) + t.Fatalf("expected network-layer state type=%T, expected ipv4State", conn.layerStates[1]) } return state } // RemoteSeqNum returns the next expected sequence number from the DUT. -func (conn *TCPIPv4) RemoteSeqNum() *seqnum.Value { - return conn.tcpState().remoteSeqNum +func (conn *TCPIPv4) RemoteSeqNum(t *testing.T) *seqnum.Value { + t.Helper() + + return conn.tcpState(t).remoteSeqNum } // LocalSeqNum returns the next sequence number to send from the testbench. -func (conn *TCPIPv4) LocalSeqNum() *seqnum.Value { - return conn.tcpState().localSeqNum +func (conn *TCPIPv4) LocalSeqNum(t *testing.T) *seqnum.Value { + t.Helper() + + return conn.tcpState(t).localSeqNum } // SynAck returns the SynAck that was part of the handshake. -func (conn *TCPIPv4) SynAck() *TCP { - return conn.tcpState().synAck +func (conn *TCPIPv4) SynAck(t *testing.T) *TCP { + t.Helper() + + return conn.tcpState(t).synAck } // LocalAddr gets the local socket address of this connection. -func (conn *TCPIPv4) LocalAddr() *unix.SockaddrInet4 { - sa := &unix.SockaddrInet4{Port: int(*conn.tcpState().out.SrcPort)} - copy(sa.Addr[:], *conn.ipv4State().out.SrcAddr) +func (conn *TCPIPv4) LocalAddr(t *testing.T) *unix.SockaddrInet4 { + t.Helper() + + sa := &unix.SockaddrInet4{Port: int(*conn.tcpState(t).out.SrcPort)} + copy(sa.Addr[:], *conn.ipv4State(t).out.SrcAddr) return sa } // Drain drains the sniffer's receive buffer by receiving packets until there's // nothing else to receive. -func (conn *TCPIPv4) Drain() { - conn.sniffer.Drain() +func (conn *TCPIPv4) Drain(t *testing.T) { + t.Helper() + + conn.sniffer.Drain(t) } // IPv6Conn maintains the state for all the layers in a IPv6 connection. @@ -801,6 +844,8 @@ type IPv6Conn Connection // NewIPv6Conn creates a new IPv6Conn connection with reasonable defaults. func NewIPv6Conn(t *testing.T, outgoingIPv6, incomingIPv6 IPv6) IPv6Conn { + t.Helper() + etherState, err := newEtherState(Ether{}, Ether{}) if err != nil { t.Fatalf("can't make EtherState: %s", err) @@ -823,25 +868,30 @@ func NewIPv6Conn(t *testing.T, outgoingIPv6, incomingIPv6 IPv6) IPv6Conn { layerStates: []layerState{etherState, ipv6State}, injector: injector, sniffer: sniffer, - t: t, } } // Send sends a frame with ipv6 overriding the IPv6 layer defaults and // additionalLayers added after it. -func (conn *IPv6Conn) Send(ipv6 IPv6, additionalLayers ...Layer) { - (*Connection)(conn).send(Layers{&ipv6}, additionalLayers...) +func (conn *IPv6Conn) Send(t *testing.T, ipv6 IPv6, additionalLayers ...Layer) { + t.Helper() + + (*Connection)(conn).send(t, Layers{&ipv6}, additionalLayers...) } // Close to clean up any resources held. -func (conn *IPv6Conn) Close() { - (*Connection)(conn).Close() +func (conn *IPv6Conn) Close(t *testing.T) { + t.Helper() + + (*Connection)(conn).Close(t) } // ExpectFrame expects a frame that matches the provided Layers within the // timeout specified. If it doesn't arrive in time, an error is returned. -func (conn *IPv6Conn) ExpectFrame(frame Layers, timeout time.Duration) (Layers, error) { - return (*Connection)(conn).ExpectFrame(frame, timeout) +func (conn *IPv6Conn) ExpectFrame(t *testing.T, frame Layers, timeout time.Duration) (Layers, error) { + t.Helper() + + return (*Connection)(conn).ExpectFrame(t, frame, timeout) } // UDPIPv4 maintains the state for all the layers in a UDP/IPv4 connection. @@ -849,6 +899,8 @@ type UDPIPv4 Connection // NewUDPIPv4 creates a new UDPIPv4 connection with reasonable defaults. func NewUDPIPv4(t *testing.T, outgoingUDP, incomingUDP UDP) UDPIPv4 { + t.Helper() + etherState, err := newEtherState(Ether{}, Ether{}) if err != nil { t.Fatalf("can't make etherState: %s", err) @@ -874,81 +926,96 @@ func NewUDPIPv4(t *testing.T, outgoingUDP, incomingUDP UDP) UDPIPv4 { layerStates: []layerState{etherState, ipv4State, udpState}, injector: injector, sniffer: sniffer, - t: t, } } -func (conn *UDPIPv4) udpState() *udpState { +func (conn *UDPIPv4) udpState(t *testing.T) *udpState { + t.Helper() + state, ok := conn.layerStates[2].(*udpState) if !ok { - conn.t.Fatalf("got transport-layer state type=%T, expected udpState", conn.layerStates[2]) + t.Fatalf("got transport-layer state type=%T, expected udpState", conn.layerStates[2]) } return state } -func (conn *UDPIPv4) ipv4State() *ipv4State { +func (conn *UDPIPv4) ipv4State(t *testing.T) *ipv4State { + t.Helper() + state, ok := conn.layerStates[1].(*ipv4State) if !ok { - conn.t.Fatalf("got network-layer state type=%T, expected ipv4State", conn.layerStates[1]) + t.Fatalf("got network-layer state type=%T, expected ipv4State", conn.layerStates[1]) } return state } // LocalAddr gets the local socket address of this connection. -func (conn *UDPIPv4) LocalAddr() *unix.SockaddrInet4 { - sa := &unix.SockaddrInet4{Port: int(*conn.udpState().out.SrcPort)} - copy(sa.Addr[:], *conn.ipv4State().out.SrcAddr) +func (conn *UDPIPv4) LocalAddr(t *testing.T) *unix.SockaddrInet4 { + t.Helper() + + sa := &unix.SockaddrInet4{Port: int(*conn.udpState(t).out.SrcPort)} + copy(sa.Addr[:], *conn.ipv4State(t).out.SrcAddr) return sa } // Send sends a packet with reasonable defaults, potentially overriding the UDP // layer and adding additionLayers. -func (conn *UDPIPv4) Send(udp UDP, additionalLayers ...Layer) { - (*Connection)(conn).send(Layers{&udp}, additionalLayers...) +func (conn *UDPIPv4) Send(t *testing.T, udp UDP, additionalLayers ...Layer) { + t.Helper() + + (*Connection)(conn).send(t, Layers{&udp}, additionalLayers...) } // SendIP sends a packet with reasonable defaults, potentially overriding the // UDP and IPv4 headers and adding additionLayers. -func (conn *UDPIPv4) SendIP(ip IPv4, udp UDP, additionalLayers ...Layer) { - (*Connection)(conn).send(Layers{&ip, &udp}, additionalLayers...) +func (conn *UDPIPv4) SendIP(t *testing.T, ip IPv4, udp UDP, additionalLayers ...Layer) { + t.Helper() + + (*Connection)(conn).send(t, Layers{&ip, &udp}, additionalLayers...) } // Expect expects a frame with the UDP layer matching the provided UDP within // the timeout specified. If it doesn't arrive in time, an error is returned. -func (conn *UDPIPv4) Expect(udp UDP, timeout time.Duration) (*UDP, error) { - conn.t.Helper() - layer, err := (*Connection)(conn).Expect(&udp, timeout) +func (conn *UDPIPv4) Expect(t *testing.T, udp UDP, timeout time.Duration) (*UDP, error) { + t.Helper() + + layer, err := (*Connection)(conn).Expect(t, &udp, timeout) if err != nil { return nil, err } gotUDP, ok := layer.(*UDP) if !ok { - conn.t.Fatalf("expected %s to be UDP", layer) + t.Fatalf("expected %s to be UDP", layer) } return gotUDP, nil } // ExpectData is a convenient method that expects a Layer and the Layer after // it. If it doens't arrive in time, it returns nil. -func (conn *UDPIPv4) ExpectData(udp UDP, payload Payload, timeout time.Duration) (Layers, error) { - conn.t.Helper() +func (conn *UDPIPv4) ExpectData(t *testing.T, udp UDP, payload Payload, timeout time.Duration) (Layers, error) { + t.Helper() + expected := make([]Layer, len(conn.layerStates)) expected[len(expected)-1] = &udp if payload.length() != 0 { expected = append(expected, &payload) } - return (*Connection)(conn).ExpectFrame(expected, timeout) + return (*Connection)(conn).ExpectFrame(t, expected, timeout) } // Close frees associated resources held by the UDPIPv4 connection. -func (conn *UDPIPv4) Close() { - (*Connection)(conn).Close() +func (conn *UDPIPv4) Close(t *testing.T) { + t.Helper() + + (*Connection)(conn).Close(t) } // Drain drains the sniffer's receive buffer by receiving packets until there's // nothing else to receive. -func (conn *UDPIPv4) Drain() { - conn.sniffer.Drain() +func (conn *UDPIPv4) Drain(t *testing.T) { + t.Helper() + + conn.sniffer.Drain(t) } // UDPIPv6 maintains the state for all the layers in a UDP/IPv6 connection. @@ -956,6 +1023,8 @@ type UDPIPv6 Connection // NewUDPIPv6 creates a new UDPIPv6 connection with reasonable defaults. func NewUDPIPv6(t *testing.T, outgoingUDP, incomingUDP UDP) UDPIPv6 { + t.Helper() + etherState, err := newEtherState(Ether{}, Ether{}) if err != nil { t.Fatalf("can't make etherState: %s", err) @@ -980,84 +1049,157 @@ func NewUDPIPv6(t *testing.T, outgoingUDP, incomingUDP UDP) UDPIPv6 { layerStates: []layerState{etherState, ipv6State, udpState}, injector: injector, sniffer: sniffer, - t: t, } } -func (conn *UDPIPv6) udpState() *udpState { +func (conn *UDPIPv6) udpState(t *testing.T) *udpState { + t.Helper() + state, ok := conn.layerStates[2].(*udpState) if !ok { - conn.t.Fatalf("got transport-layer state type=%T, expected udpState", conn.layerStates[2]) + t.Fatalf("got transport-layer state type=%T, expected udpState", conn.layerStates[2]) } return state } -func (conn *UDPIPv6) ipv6State() *ipv6State { +func (conn *UDPIPv6) ipv6State(t *testing.T) *ipv6State { + t.Helper() + state, ok := conn.layerStates[1].(*ipv6State) if !ok { - conn.t.Fatalf("got network-layer state type=%T, expected ipv6State", conn.layerStates[1]) + t.Fatalf("got network-layer state type=%T, expected ipv6State", conn.layerStates[1]) } return state } // LocalAddr gets the local socket address of this connection. -func (conn *UDPIPv6) LocalAddr() *unix.SockaddrInet6 { +func (conn *UDPIPv6) LocalAddr(t *testing.T) *unix.SockaddrInet6 { + t.Helper() + sa := &unix.SockaddrInet6{ - Port: int(*conn.udpState().out.SrcPort), + Port: int(*conn.udpState(t).out.SrcPort), // Local address is in perspective to the remote host, so it's scoped to the // ID of the remote interface. ZoneId: uint32(RemoteInterfaceID), } - copy(sa.Addr[:], *conn.ipv6State().out.SrcAddr) + copy(sa.Addr[:], *conn.ipv6State(t).out.SrcAddr) return sa } // Send sends a packet with reasonable defaults, potentially overriding the UDP // layer and adding additionLayers. -func (conn *UDPIPv6) Send(udp UDP, additionalLayers ...Layer) { - (*Connection)(conn).send(Layers{&udp}, additionalLayers...) +func (conn *UDPIPv6) Send(t *testing.T, udp UDP, additionalLayers ...Layer) { + t.Helper() + + (*Connection)(conn).send(t, Layers{&udp}, additionalLayers...) } // SendIPv6 sends a packet with reasonable defaults, potentially overriding the // UDP and IPv6 headers and adding additionLayers. -func (conn *UDPIPv6) SendIPv6(ip IPv6, udp UDP, additionalLayers ...Layer) { - (*Connection)(conn).send(Layers{&ip, &udp}, additionalLayers...) +func (conn *UDPIPv6) SendIPv6(t *testing.T, ip IPv6, udp UDP, additionalLayers ...Layer) { + t.Helper() + + (*Connection)(conn).send(t, Layers{&ip, &udp}, additionalLayers...) } // Expect expects a frame with the UDP layer matching the provided UDP within // the timeout specified. If it doesn't arrive in time, an error is returned. -func (conn *UDPIPv6) Expect(udp UDP, timeout time.Duration) (*UDP, error) { - conn.t.Helper() - layer, err := (*Connection)(conn).Expect(&udp, timeout) +func (conn *UDPIPv6) Expect(t *testing.T, udp UDP, timeout time.Duration) (*UDP, error) { + t.Helper() + + layer, err := (*Connection)(conn).Expect(t, &udp, timeout) if err != nil { return nil, err } gotUDP, ok := layer.(*UDP) if !ok { - conn.t.Fatalf("expected %s to be UDP", layer) + t.Fatalf("expected %s to be UDP", layer) } return gotUDP, nil } // ExpectData is a convenient method that expects a Layer and the Layer after // it. If it doens't arrive in time, it returns nil. -func (conn *UDPIPv6) ExpectData(udp UDP, payload Payload, timeout time.Duration) (Layers, error) { - conn.t.Helper() +func (conn *UDPIPv6) ExpectData(t *testing.T, udp UDP, payload Payload, timeout time.Duration) (Layers, error) { + t.Helper() + expected := make([]Layer, len(conn.layerStates)) expected[len(expected)-1] = &udp if payload.length() != 0 { expected = append(expected, &payload) } - return (*Connection)(conn).ExpectFrame(expected, timeout) + return (*Connection)(conn).ExpectFrame(t, expected, timeout) } // Close frees associated resources held by the UDPIPv6 connection. -func (conn *UDPIPv6) Close() { - (*Connection)(conn).Close() +func (conn *UDPIPv6) Close(t *testing.T) { + t.Helper() + + (*Connection)(conn).Close(t) } // Drain drains the sniffer's receive buffer by receiving packets until there's // nothing else to receive. -func (conn *UDPIPv6) Drain() { - conn.sniffer.Drain() +func (conn *UDPIPv6) Drain(t *testing.T) { + t.Helper() + + conn.sniffer.Drain(t) +} + +// TCPIPv6 maintains the state for all the layers in a TCP/IPv6 connection. +type TCPIPv6 Connection + +// NewTCPIPv6 creates a new TCPIPv6 connection with reasonable defaults. +func NewTCPIPv6(t *testing.T, outgoingTCP, incomingTCP TCP) TCPIPv6 { + etherState, err := newEtherState(Ether{}, Ether{}) + if err != nil { + t.Fatalf("can't make etherState: %s", err) + } + ipv6State, err := newIPv6State(IPv6{}, IPv6{}) + if err != nil { + t.Fatalf("can't make ipv6State: %s", err) + } + tcpState, err := newTCPState(unix.AF_INET6, outgoingTCP, incomingTCP) + if err != nil { + t.Fatalf("can't make tcpState: %s", err) + } + injector, err := NewInjector(t) + if err != nil { + t.Fatalf("can't make injector: %s", err) + } + sniffer, err := NewSniffer(t) + if err != nil { + t.Fatalf("can't make sniffer: %s", err) + } + + return TCPIPv6{ + layerStates: []layerState{etherState, ipv6State, tcpState}, + injector: injector, + sniffer: sniffer, + } +} + +func (conn *TCPIPv6) SrcPort() uint16 { + state := conn.layerStates[2].(*tcpState) + return *state.out.SrcPort +} + +// ExpectData is a convenient method that expects a Layer and the Layer after +// it. If it doens't arrive in time, it returns nil. +func (conn *TCPIPv6) ExpectData(t *testing.T, tcp *TCP, payload *Payload, timeout time.Duration) (Layers, error) { + t.Helper() + + expected := make([]Layer, len(conn.layerStates)) + expected[len(expected)-1] = tcp + if payload != nil { + expected = append(expected, payload) + } + return (*Connection)(conn).ExpectFrame(t, expected, timeout) +} + +// Close frees associated resources held by the TCPIPv6 connection. +func (conn *TCPIPv6) Close(t *testing.T) { + t.Helper() + + (*Connection)(conn).Close(t) } diff --git a/test/packetimpact/testbench/dut.go b/test/packetimpact/testbench/dut.go index 51be13759..73c532e75 100644 --- a/test/packetimpact/testbench/dut.go +++ b/test/packetimpact/testbench/dut.go @@ -31,13 +31,14 @@ import ( // DUT communicates with the DUT to force it to make POSIX calls. type DUT struct { - t *testing.T conn *grpc.ClientConn posixServer POSIXClient } // NewDUT creates a new connection with the DUT over gRPC. func NewDUT(t *testing.T) DUT { + t.Helper() + flag.Parse() if err := genPseudoFlags(); err != nil { t.Fatal("generating psuedo flags:", err) @@ -50,7 +51,6 @@ func NewDUT(t *testing.T) DUT { } posixServer := NewPOSIXClient(conn) return DUT{ - t: t, conn: conn, posixServer: posixServer, } @@ -61,8 +61,9 @@ func (dut *DUT) TearDown() { dut.conn.Close() } -func (dut *DUT) sockaddrToProto(sa unix.Sockaddr) *pb.Sockaddr { - dut.t.Helper() +func (dut *DUT) sockaddrToProto(t *testing.T, sa unix.Sockaddr) *pb.Sockaddr { + t.Helper() + switch s := sa.(type) { case *unix.SockaddrInet4: return &pb.Sockaddr{ @@ -87,12 +88,13 @@ func (dut *DUT) sockaddrToProto(sa unix.Sockaddr) *pb.Sockaddr { }, } } - dut.t.Fatalf("can't parse Sockaddr struct: %+v", sa) + t.Fatalf("can't parse Sockaddr struct: %+v", sa) return nil } -func (dut *DUT) protoToSockaddr(sa *pb.Sockaddr) unix.Sockaddr { - dut.t.Helper() +func (dut *DUT) protoToSockaddr(t *testing.T, sa *pb.Sockaddr) unix.Sockaddr { + t.Helper() + switch s := sa.Sockaddr.(type) { case *pb.Sockaddr_In: ret := unix.SockaddrInet4{ @@ -108,31 +110,32 @@ func (dut *DUT) protoToSockaddr(sa *pb.Sockaddr) unix.Sockaddr { copy(ret.Addr[:], s.In6.GetAddr()) return &ret } - dut.t.Fatalf("can't parse Sockaddr proto: %+v", sa) + t.Fatalf("can't parse Sockaddr proto: %#v", sa) return nil } // CreateBoundSocket makes a new socket on the DUT, with type typ and protocol // proto, and bound to the IP address addr. Returns the new file descriptor and // the port that was selected on the DUT. -func (dut *DUT) CreateBoundSocket(typ, proto int32, addr net.IP) (int32, uint16) { - dut.t.Helper() +func (dut *DUT) CreateBoundSocket(t *testing.T, typ, proto int32, addr net.IP) (int32, uint16) { + t.Helper() + var fd int32 if addr.To4() != nil { - fd = dut.Socket(unix.AF_INET, typ, proto) + fd = dut.Socket(t, unix.AF_INET, typ, proto) sa := unix.SockaddrInet4{} copy(sa.Addr[:], addr.To4()) - dut.Bind(fd, &sa) + dut.Bind(t, fd, &sa) } else if addr.To16() != nil { - fd = dut.Socket(unix.AF_INET6, typ, proto) + fd = dut.Socket(t, unix.AF_INET6, typ, proto) sa := unix.SockaddrInet6{} copy(sa.Addr[:], addr.To16()) sa.ZoneId = uint32(RemoteInterfaceID) - dut.Bind(fd, &sa) + dut.Bind(t, fd, &sa) } else { - dut.t.Fatalf("unknown ip addr type for remoteIP") + t.Fatalf("invalid IP address: %s", addr) } - sa := dut.GetSockName(fd) + sa := dut.GetSockName(t, fd) var port int switch s := sa.(type) { case *unix.SockaddrInet4: @@ -140,15 +143,17 @@ func (dut *DUT) CreateBoundSocket(typ, proto int32, addr net.IP) (int32, uint16) case *unix.SockaddrInet6: port = s.Port default: - dut.t.Fatalf("unknown sockaddr type from getsockname: %t", sa) + t.Fatalf("unknown sockaddr type from getsockname: %T", sa) } return fd, uint16(port) } // CreateListener makes a new TCP connection. If it fails, the test ends. -func (dut *DUT) CreateListener(typ, proto, backlog int32) (int32, uint16) { - fd, remotePort := dut.CreateBoundSocket(typ, proto, net.ParseIP(RemoteIPv4)) - dut.Listen(fd, backlog) +func (dut *DUT) CreateListener(t *testing.T, typ, proto, backlog int32) (int32, uint16) { + t.Helper() + + fd, remotePort := dut.CreateBoundSocket(t, typ, proto, net.ParseIP(RemoteIPv4)) + dut.Listen(t, fd, backlog) return fd, remotePort } @@ -158,53 +163,57 @@ func (dut *DUT) CreateListener(typ, proto, backlog int32) (int32, uint16) { // Accept calls accept on the DUT and causes a fatal test failure if it doesn't // succeed. If more control over the timeout or error handling is needed, use // AcceptWithErrno. -func (dut *DUT) Accept(sockfd int32) (int32, unix.Sockaddr) { - dut.t.Helper() +func (dut *DUT) Accept(t *testing.T, sockfd int32) (int32, unix.Sockaddr) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) defer cancel() - fd, sa, err := dut.AcceptWithErrno(ctx, sockfd) + fd, sa, err := dut.AcceptWithErrno(ctx, t, sockfd) if fd < 0 { - dut.t.Fatalf("failed to accept: %s", err) + t.Fatalf("failed to accept: %s", err) } return fd, sa } // AcceptWithErrno calls accept on the DUT. -func (dut *DUT) AcceptWithErrno(ctx context.Context, sockfd int32) (int32, unix.Sockaddr, error) { - dut.t.Helper() +func (dut *DUT) AcceptWithErrno(ctx context.Context, t *testing.T, sockfd int32) (int32, unix.Sockaddr, error) { + t.Helper() + req := pb.AcceptRequest{ Sockfd: sockfd, } resp, err := dut.posixServer.Accept(ctx, &req) if err != nil { - dut.t.Fatalf("failed to call Accept: %s", err) + t.Fatalf("failed to call Accept: %s", err) } - return resp.GetFd(), dut.protoToSockaddr(resp.GetAddr()), syscall.Errno(resp.GetErrno_()) + return resp.GetFd(), dut.protoToSockaddr(t, resp.GetAddr()), syscall.Errno(resp.GetErrno_()) } // Bind calls bind on the DUT and causes a fatal test failure if it doesn't // succeed. If more control over the timeout or error handling is // needed, use BindWithErrno. -func (dut *DUT) Bind(fd int32, sa unix.Sockaddr) { - dut.t.Helper() +func (dut *DUT) Bind(t *testing.T, fd int32, sa unix.Sockaddr) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) defer cancel() - ret, err := dut.BindWithErrno(ctx, fd, sa) + ret, err := dut.BindWithErrno(ctx, t, fd, sa) if ret != 0 { - dut.t.Fatalf("failed to bind socket: %s", err) + t.Fatalf("failed to bind socket: %s", err) } } // BindWithErrno calls bind on the DUT. -func (dut *DUT) BindWithErrno(ctx context.Context, fd int32, sa unix.Sockaddr) (int32, error) { - dut.t.Helper() +func (dut *DUT) BindWithErrno(ctx context.Context, t *testing.T, fd int32, sa unix.Sockaddr) (int32, error) { + t.Helper() + req := pb.BindRequest{ Sockfd: fd, - Addr: dut.sockaddrToProto(sa), + Addr: dut.sockaddrToProto(t, sa), } resp, err := dut.posixServer.Bind(ctx, &req) if err != nil { - dut.t.Fatalf("failed to call Bind: %s", err) + t.Fatalf("failed to call Bind: %s", err) } return resp.GetRet(), syscall.Errno(resp.GetErrno_()) } @@ -212,25 +221,27 @@ func (dut *DUT) BindWithErrno(ctx context.Context, fd int32, sa unix.Sockaddr) ( // Close calls close on the DUT and causes a fatal test failure if it doesn't // succeed. If more control over the timeout or error handling is needed, use // CloseWithErrno. -func (dut *DUT) Close(fd int32) { - dut.t.Helper() +func (dut *DUT) Close(t *testing.T, fd int32) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) defer cancel() - ret, err := dut.CloseWithErrno(ctx, fd) + ret, err := dut.CloseWithErrno(ctx, t, fd) if ret != 0 { - dut.t.Fatalf("failed to close: %s", err) + t.Fatalf("failed to close: %s", err) } } // CloseWithErrno calls close on the DUT. -func (dut *DUT) CloseWithErrno(ctx context.Context, fd int32) (int32, error) { - dut.t.Helper() +func (dut *DUT) CloseWithErrno(ctx context.Context, t *testing.T, fd int32) (int32, error) { + t.Helper() + req := pb.CloseRequest{ Fd: fd, } resp, err := dut.posixServer.Close(ctx, &req) if err != nil { - dut.t.Fatalf("failed to call Close: %s", err) + t.Fatalf("failed to call Close: %s", err) } return resp.GetRet(), syscall.Errno(resp.GetErrno_()) } @@ -238,28 +249,30 @@ func (dut *DUT) CloseWithErrno(ctx context.Context, fd int32) (int32, error) { // Connect calls connect on the DUT and causes a fatal test failure if it // doesn't succeed. If more control over the timeout or error handling is // needed, use ConnectWithErrno. -func (dut *DUT) Connect(fd int32, sa unix.Sockaddr) { - dut.t.Helper() +func (dut *DUT) Connect(t *testing.T, fd int32, sa unix.Sockaddr) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) defer cancel() - ret, err := dut.ConnectWithErrno(ctx, fd, sa) + ret, err := dut.ConnectWithErrno(ctx, t, fd, sa) // Ignore 'operation in progress' error that can be returned when the socket // is non-blocking. if err != syscall.Errno(unix.EINPROGRESS) && ret != 0 { - dut.t.Fatalf("failed to connect socket: %s", err) + t.Fatalf("failed to connect socket: %s", err) } } // ConnectWithErrno calls bind on the DUT. -func (dut *DUT) ConnectWithErrno(ctx context.Context, fd int32, sa unix.Sockaddr) (int32, error) { - dut.t.Helper() +func (dut *DUT) ConnectWithErrno(ctx context.Context, t *testing.T, fd int32, sa unix.Sockaddr) (int32, error) { + t.Helper() + req := pb.ConnectRequest{ Sockfd: fd, - Addr: dut.sockaddrToProto(sa), + Addr: dut.sockaddrToProto(t, sa), } resp, err := dut.posixServer.Connect(ctx, &req) if err != nil { - dut.t.Fatalf("failed to call Connect: %s", err) + t.Fatalf("failed to call Connect: %s", err) } return resp.GetRet(), syscall.Errno(resp.GetErrno_()) } @@ -267,20 +280,22 @@ func (dut *DUT) ConnectWithErrno(ctx context.Context, fd int32, sa unix.Sockaddr // Fcntl calls fcntl on the DUT and causes a fatal test failure if it // doesn't succeed. If more control over the timeout or error handling is // needed, use FcntlWithErrno. -func (dut *DUT) Fcntl(fd, cmd, arg int32) int32 { - dut.t.Helper() +func (dut *DUT) Fcntl(t *testing.T, fd, cmd, arg int32) int32 { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) defer cancel() - ret, err := dut.FcntlWithErrno(ctx, fd, cmd, arg) + ret, err := dut.FcntlWithErrno(ctx, t, fd, cmd, arg) if ret == -1 { - dut.t.Fatalf("failed to Fcntl: ret=%d, errno=%s", ret, err) + t.Fatalf("failed to Fcntl: ret=%d, errno=%s", ret, err) } return ret } // FcntlWithErrno calls fcntl on the DUT. -func (dut *DUT) FcntlWithErrno(ctx context.Context, fd, cmd, arg int32) (int32, error) { - dut.t.Helper() +func (dut *DUT) FcntlWithErrno(ctx context.Context, t *testing.T, fd, cmd, arg int32) (int32, error) { + t.Helper() + req := pb.FcntlRequest{ Fd: fd, Cmd: cmd, @@ -288,7 +303,7 @@ func (dut *DUT) FcntlWithErrno(ctx context.Context, fd, cmd, arg int32) (int32, } resp, err := dut.posixServer.Fcntl(ctx, &req) if err != nil { - dut.t.Fatalf("failed to call Fcntl: %s", err) + t.Fatalf("failed to call Fcntl: %s", err) } return resp.GetRet(), syscall.Errno(resp.GetErrno_()) } @@ -296,32 +311,35 @@ func (dut *DUT) FcntlWithErrno(ctx context.Context, fd, cmd, arg int32) (int32, // GetSockName calls getsockname on the DUT and causes a fatal test failure if // it doesn't succeed. If more control over the timeout or error handling is // needed, use GetSockNameWithErrno. -func (dut *DUT) GetSockName(sockfd int32) unix.Sockaddr { - dut.t.Helper() +func (dut *DUT) GetSockName(t *testing.T, sockfd int32) unix.Sockaddr { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) defer cancel() - ret, sa, err := dut.GetSockNameWithErrno(ctx, sockfd) + ret, sa, err := dut.GetSockNameWithErrno(ctx, t, sockfd) if ret != 0 { - dut.t.Fatalf("failed to getsockname: %s", err) + t.Fatalf("failed to getsockname: %s", err) } return sa } // GetSockNameWithErrno calls getsockname on the DUT. -func (dut *DUT) GetSockNameWithErrno(ctx context.Context, sockfd int32) (int32, unix.Sockaddr, error) { - dut.t.Helper() +func (dut *DUT) GetSockNameWithErrno(ctx context.Context, t *testing.T, sockfd int32) (int32, unix.Sockaddr, error) { + t.Helper() + req := pb.GetSockNameRequest{ Sockfd: sockfd, } resp, err := dut.posixServer.GetSockName(ctx, &req) if err != nil { - dut.t.Fatalf("failed to call Bind: %s", err) + t.Fatalf("failed to call Bind: %s", err) } - return resp.GetRet(), dut.protoToSockaddr(resp.GetAddr()), syscall.Errno(resp.GetErrno_()) + return resp.GetRet(), dut.protoToSockaddr(t, resp.GetAddr()), syscall.Errno(resp.GetErrno_()) } -func (dut *DUT) getSockOpt(ctx context.Context, sockfd, level, optname, optlen int32, typ pb.GetSockOptRequest_SockOptType) (int32, *pb.SockOptVal, error) { - dut.t.Helper() +func (dut *DUT) getSockOpt(ctx context.Context, t *testing.T, sockfd, level, optname, optlen int32, typ pb.GetSockOptRequest_SockOptType) (int32, *pb.SockOptVal, error) { + t.Helper() + req := pb.GetSockOptRequest{ Sockfd: sockfd, Level: level, @@ -331,11 +349,11 @@ func (dut *DUT) getSockOpt(ctx context.Context, sockfd, level, optname, optlen i } resp, err := dut.posixServer.GetSockOpt(ctx, &req) if err != nil { - dut.t.Fatalf("failed to call GetSockOpt: %s", err) + t.Fatalf("failed to call GetSockOpt: %s", err) } optval := resp.GetOptval() if optval == nil { - dut.t.Fatalf("GetSockOpt response does not contain a value") + t.Fatalf("GetSockOpt response does not contain a value") } return resp.GetRet(), optval, syscall.Errno(resp.GetErrno_()) } @@ -345,13 +363,14 @@ func (dut *DUT) getSockOpt(ctx context.Context, sockfd, level, optname, optlen i // needed, use GetSockOptWithErrno. Because endianess and the width of values // might differ between the testbench and DUT architectures, prefer to use a // more specific GetSockOptXxx function. -func (dut *DUT) GetSockOpt(sockfd, level, optname, optlen int32) []byte { - dut.t.Helper() +func (dut *DUT) GetSockOpt(t *testing.T, sockfd, level, optname, optlen int32) []byte { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) defer cancel() - ret, optval, err := dut.GetSockOptWithErrno(ctx, sockfd, level, optname, optlen) + ret, optval, err := dut.GetSockOptWithErrno(ctx, t, sockfd, level, optname, optlen) if ret != 0 { - dut.t.Fatalf("failed to GetSockOpt: %s", err) + t.Fatalf("failed to GetSockOpt: %s", err) } return optval } @@ -359,12 +378,13 @@ func (dut *DUT) GetSockOpt(sockfd, level, optname, optlen int32) []byte { // GetSockOptWithErrno calls getsockopt on the DUT. Because endianess and the // width of values might differ between the testbench and DUT architectures, // prefer to use a more specific GetSockOptXxxWithErrno function. -func (dut *DUT) GetSockOptWithErrno(ctx context.Context, sockfd, level, optname, optlen int32) (int32, []byte, error) { - dut.t.Helper() - ret, optval, errno := dut.getSockOpt(ctx, sockfd, level, optname, optlen, pb.GetSockOptRequest_BYTES) +func (dut *DUT) GetSockOptWithErrno(ctx context.Context, t *testing.T, sockfd, level, optname, optlen int32) (int32, []byte, error) { + t.Helper() + + ret, optval, errno := dut.getSockOpt(ctx, t, sockfd, level, optname, optlen, pb.GetSockOptRequest_BYTES) bytesval, ok := optval.Val.(*pb.SockOptVal_Bytesval) if !ok { - dut.t.Fatalf("GetSockOpt got value type: %T, want bytes", optval) + t.Fatalf("GetSockOpt got value type: %T, want bytes", optval.Val) } return ret, bytesval.Bytesval, errno } @@ -372,24 +392,26 @@ func (dut *DUT) GetSockOptWithErrno(ctx context.Context, sockfd, level, optname, // GetSockOptInt calls getsockopt on the DUT and causes a fatal test failure // if it doesn't succeed. If more control over the int optval or error handling // is needed, use GetSockOptIntWithErrno. -func (dut *DUT) GetSockOptInt(sockfd, level, optname int32) int32 { - dut.t.Helper() +func (dut *DUT) GetSockOptInt(t *testing.T, sockfd, level, optname int32) int32 { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) defer cancel() - ret, intval, err := dut.GetSockOptIntWithErrno(ctx, sockfd, level, optname) + ret, intval, err := dut.GetSockOptIntWithErrno(ctx, t, sockfd, level, optname) if ret != 0 { - dut.t.Fatalf("failed to GetSockOptInt: %s", err) + t.Fatalf("failed to GetSockOptInt: %s", err) } return intval } // GetSockOptIntWithErrno calls getsockopt with an integer optval. -func (dut *DUT) GetSockOptIntWithErrno(ctx context.Context, sockfd, level, optname int32) (int32, int32, error) { - dut.t.Helper() - ret, optval, errno := dut.getSockOpt(ctx, sockfd, level, optname, 0, pb.GetSockOptRequest_INT) +func (dut *DUT) GetSockOptIntWithErrno(ctx context.Context, t *testing.T, sockfd, level, optname int32) (int32, int32, error) { + t.Helper() + + ret, optval, errno := dut.getSockOpt(ctx, t, sockfd, level, optname, 0, pb.GetSockOptRequest_INT) intval, ok := optval.Val.(*pb.SockOptVal_Intval) if !ok { - dut.t.Fatalf("GetSockOpt got value type: %T, want int", optval) + t.Fatalf("GetSockOpt got value type: %T, want int", optval.Val) } return ret, intval.Intval, errno } @@ -397,24 +419,26 @@ func (dut *DUT) GetSockOptIntWithErrno(ctx context.Context, sockfd, level, optna // GetSockOptTimeval calls getsockopt on the DUT and causes a fatal test failure // if it doesn't succeed. If more control over the timeout or error handling is // needed, use GetSockOptTimevalWithErrno. -func (dut *DUT) GetSockOptTimeval(sockfd, level, optname int32) unix.Timeval { - dut.t.Helper() +func (dut *DUT) GetSockOptTimeval(t *testing.T, sockfd, level, optname int32) unix.Timeval { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) defer cancel() - ret, timeval, err := dut.GetSockOptTimevalWithErrno(ctx, sockfd, level, optname) + ret, timeval, err := dut.GetSockOptTimevalWithErrno(ctx, t, sockfd, level, optname) if ret != 0 { - dut.t.Fatalf("failed to GetSockOptTimeval: %s", err) + t.Fatalf("failed to GetSockOptTimeval: %s", err) } return timeval } // GetSockOptTimevalWithErrno calls getsockopt and returns a timeval. -func (dut *DUT) GetSockOptTimevalWithErrno(ctx context.Context, sockfd, level, optname int32) (int32, unix.Timeval, error) { - dut.t.Helper() - ret, optval, errno := dut.getSockOpt(ctx, sockfd, level, optname, 0, pb.GetSockOptRequest_TIME) +func (dut *DUT) GetSockOptTimevalWithErrno(ctx context.Context, t *testing.T, sockfd, level, optname int32) (int32, unix.Timeval, error) { + t.Helper() + + ret, optval, errno := dut.getSockOpt(ctx, t, sockfd, level, optname, 0, pb.GetSockOptRequest_TIME) tv, ok := optval.Val.(*pb.SockOptVal_Timeval) if !ok { - dut.t.Fatalf("GetSockOpt got value type: %T, want timeval", optval) + t.Fatalf("GetSockOpt got value type: %T, want timeval", optval.Val) } timeval := unix.Timeval{ Sec: tv.Timeval.Seconds, @@ -426,26 +450,28 @@ func (dut *DUT) GetSockOptTimevalWithErrno(ctx context.Context, sockfd, level, o // Listen calls listen on the DUT and causes a fatal test failure if it doesn't // succeed. If more control over the timeout or error handling is needed, use // ListenWithErrno. -func (dut *DUT) Listen(sockfd, backlog int32) { - dut.t.Helper() +func (dut *DUT) Listen(t *testing.T, sockfd, backlog int32) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) defer cancel() - ret, err := dut.ListenWithErrno(ctx, sockfd, backlog) + ret, err := dut.ListenWithErrno(ctx, t, sockfd, backlog) if ret != 0 { - dut.t.Fatalf("failed to listen: %s", err) + t.Fatalf("failed to listen: %s", err) } } // ListenWithErrno calls listen on the DUT. -func (dut *DUT) ListenWithErrno(ctx context.Context, sockfd, backlog int32) (int32, error) { - dut.t.Helper() +func (dut *DUT) ListenWithErrno(ctx context.Context, t *testing.T, sockfd, backlog int32) (int32, error) { + t.Helper() + req := pb.ListenRequest{ Sockfd: sockfd, Backlog: backlog, } resp, err := dut.posixServer.Listen(ctx, &req) if err != nil { - dut.t.Fatalf("failed to call Listen: %s", err) + t.Fatalf("failed to call Listen: %s", err) } return resp.GetRet(), syscall.Errno(resp.GetErrno_()) } @@ -453,20 +479,22 @@ func (dut *DUT) ListenWithErrno(ctx context.Context, sockfd, backlog int32) (int // Send calls send on the DUT and causes a fatal test failure if it doesn't // succeed. If more control over the timeout or error handling is needed, use // SendWithErrno. -func (dut *DUT) Send(sockfd int32, buf []byte, flags int32) int32 { - dut.t.Helper() +func (dut *DUT) Send(t *testing.T, sockfd int32, buf []byte, flags int32) int32 { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) defer cancel() - ret, err := dut.SendWithErrno(ctx, sockfd, buf, flags) + ret, err := dut.SendWithErrno(ctx, t, sockfd, buf, flags) if ret == -1 { - dut.t.Fatalf("failed to send: %s", err) + t.Fatalf("failed to send: %s", err) } return ret } // SendWithErrno calls send on the DUT. -func (dut *DUT) SendWithErrno(ctx context.Context, sockfd int32, buf []byte, flags int32) (int32, error) { - dut.t.Helper() +func (dut *DUT) SendWithErrno(ctx context.Context, t *testing.T, sockfd int32, buf []byte, flags int32) (int32, error) { + t.Helper() + req := pb.SendRequest{ Sockfd: sockfd, Buf: buf, @@ -474,7 +502,7 @@ func (dut *DUT) SendWithErrno(ctx context.Context, sockfd int32, buf []byte, fla } resp, err := dut.posixServer.Send(ctx, &req) if err != nil { - dut.t.Fatalf("failed to call Send: %s", err) + t.Fatalf("failed to call Send: %s", err) } return resp.GetRet(), syscall.Errno(resp.GetErrno_()) } @@ -482,48 +510,52 @@ func (dut *DUT) SendWithErrno(ctx context.Context, sockfd int32, buf []byte, fla // SendTo calls sendto on the DUT and causes a fatal test failure if it doesn't // succeed. If more control over the timeout or error handling is needed, use // SendToWithErrno. -func (dut *DUT) SendTo(sockfd int32, buf []byte, flags int32, destAddr unix.Sockaddr) int32 { - dut.t.Helper() +func (dut *DUT) SendTo(t *testing.T, sockfd int32, buf []byte, flags int32, destAddr unix.Sockaddr) int32 { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) defer cancel() - ret, err := dut.SendToWithErrno(ctx, sockfd, buf, flags, destAddr) + ret, err := dut.SendToWithErrno(ctx, t, sockfd, buf, flags, destAddr) if ret == -1 { - dut.t.Fatalf("failed to sendto: %s", err) + t.Fatalf("failed to sendto: %s", err) } return ret } // SendToWithErrno calls sendto on the DUT. -func (dut *DUT) SendToWithErrno(ctx context.Context, sockfd int32, buf []byte, flags int32, destAddr unix.Sockaddr) (int32, error) { - dut.t.Helper() +func (dut *DUT) SendToWithErrno(ctx context.Context, t *testing.T, sockfd int32, buf []byte, flags int32, destAddr unix.Sockaddr) (int32, error) { + t.Helper() + req := pb.SendToRequest{ Sockfd: sockfd, Buf: buf, Flags: flags, - DestAddr: dut.sockaddrToProto(destAddr), + DestAddr: dut.sockaddrToProto(t, destAddr), } resp, err := dut.posixServer.SendTo(ctx, &req) if err != nil { - dut.t.Fatalf("faled to call SendTo: %s", err) + t.Fatalf("faled to call SendTo: %s", err) } return resp.GetRet(), syscall.Errno(resp.GetErrno_()) } // SetNonBlocking will set O_NONBLOCK flag for fd if nonblocking // is true, otherwise it will clear the flag. -func (dut *DUT) SetNonBlocking(fd int32, nonblocking bool) { - dut.t.Helper() - flags := dut.Fcntl(fd, unix.F_GETFL, 0) +func (dut *DUT) SetNonBlocking(t *testing.T, fd int32, nonblocking bool) { + t.Helper() + + flags := dut.Fcntl(t, fd, unix.F_GETFL, 0) if nonblocking { flags |= unix.O_NONBLOCK } else { flags &= ^unix.O_NONBLOCK } - dut.Fcntl(fd, unix.F_SETFL, flags) + dut.Fcntl(t, fd, unix.F_SETFL, flags) } -func (dut *DUT) setSockOpt(ctx context.Context, sockfd, level, optname int32, optval *pb.SockOptVal) (int32, error) { - dut.t.Helper() +func (dut *DUT) setSockOpt(ctx context.Context, t *testing.T, sockfd, level, optname int32, optval *pb.SockOptVal) (int32, error) { + t.Helper() + req := pb.SetSockOptRequest{ Sockfd: sockfd, Level: level, @@ -532,7 +564,7 @@ func (dut *DUT) setSockOpt(ctx context.Context, sockfd, level, optname int32, op } resp, err := dut.posixServer.SetSockOpt(ctx, &req) if err != nil { - dut.t.Fatalf("failed to call SetSockOpt: %s", err) + t.Fatalf("failed to call SetSockOpt: %s", err) } return resp.GetRet(), syscall.Errno(resp.GetErrno_()) } @@ -542,81 +574,89 @@ func (dut *DUT) setSockOpt(ctx context.Context, sockfd, level, optname int32, op // needed, use SetSockOptWithErrno. Because endianess and the width of values // might differ between the testbench and DUT architectures, prefer to use a // more specific SetSockOptXxx function. -func (dut *DUT) SetSockOpt(sockfd, level, optname int32, optval []byte) { - dut.t.Helper() +func (dut *DUT) SetSockOpt(t *testing.T, sockfd, level, optname int32, optval []byte) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) defer cancel() - ret, err := dut.SetSockOptWithErrno(ctx, sockfd, level, optname, optval) + ret, err := dut.SetSockOptWithErrno(ctx, t, sockfd, level, optname, optval) if ret != 0 { - dut.t.Fatalf("failed to SetSockOpt: %s", err) + t.Fatalf("failed to SetSockOpt: %s", err) } } // SetSockOptWithErrno calls setsockopt on the DUT. Because endianess and the // width of values might differ between the testbench and DUT architectures, // prefer to use a more specific SetSockOptXxxWithErrno function. -func (dut *DUT) SetSockOptWithErrno(ctx context.Context, sockfd, level, optname int32, optval []byte) (int32, error) { - dut.t.Helper() - return dut.setSockOpt(ctx, sockfd, level, optname, &pb.SockOptVal{Val: &pb.SockOptVal_Bytesval{optval}}) +func (dut *DUT) SetSockOptWithErrno(ctx context.Context, t *testing.T, sockfd, level, optname int32, optval []byte) (int32, error) { + t.Helper() + + return dut.setSockOpt(ctx, t, sockfd, level, optname, &pb.SockOptVal{Val: &pb.SockOptVal_Bytesval{optval}}) } // SetSockOptInt calls setsockopt on the DUT and causes a fatal test failure // if it doesn't succeed. If more control over the int optval or error handling // is needed, use SetSockOptIntWithErrno. -func (dut *DUT) SetSockOptInt(sockfd, level, optname, optval int32) { - dut.t.Helper() +func (dut *DUT) SetSockOptInt(t *testing.T, sockfd, level, optname, optval int32) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) defer cancel() - ret, err := dut.SetSockOptIntWithErrno(ctx, sockfd, level, optname, optval) + ret, err := dut.SetSockOptIntWithErrno(ctx, t, sockfd, level, optname, optval) if ret != 0 { - dut.t.Fatalf("failed to SetSockOptInt: %s", err) + t.Fatalf("failed to SetSockOptInt: %s", err) } } // SetSockOptIntWithErrno calls setsockopt with an integer optval. -func (dut *DUT) SetSockOptIntWithErrno(ctx context.Context, sockfd, level, optname, optval int32) (int32, error) { - dut.t.Helper() - return dut.setSockOpt(ctx, sockfd, level, optname, &pb.SockOptVal{Val: &pb.SockOptVal_Intval{optval}}) +func (dut *DUT) SetSockOptIntWithErrno(ctx context.Context, t *testing.T, sockfd, level, optname, optval int32) (int32, error) { + t.Helper() + + return dut.setSockOpt(ctx, t, sockfd, level, optname, &pb.SockOptVal{Val: &pb.SockOptVal_Intval{optval}}) } // SetSockOptTimeval calls setsockopt on the DUT and causes a fatal test failure // if it doesn't succeed. If more control over the timeout or error handling is // needed, use SetSockOptTimevalWithErrno. -func (dut *DUT) SetSockOptTimeval(sockfd, level, optname int32, tv *unix.Timeval) { - dut.t.Helper() +func (dut *DUT) SetSockOptTimeval(t *testing.T, sockfd, level, optname int32, tv *unix.Timeval) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) defer cancel() - ret, err := dut.SetSockOptTimevalWithErrno(ctx, sockfd, level, optname, tv) + ret, err := dut.SetSockOptTimevalWithErrno(ctx, t, sockfd, level, optname, tv) if ret != 0 { - dut.t.Fatalf("failed to SetSockOptTimeval: %s", err) + t.Fatalf("failed to SetSockOptTimeval: %s", err) } } // SetSockOptTimevalWithErrno calls setsockopt with the timeval converted to // bytes. -func (dut *DUT) SetSockOptTimevalWithErrno(ctx context.Context, sockfd, level, optname int32, tv *unix.Timeval) (int32, error) { - dut.t.Helper() +func (dut *DUT) SetSockOptTimevalWithErrno(ctx context.Context, t *testing.T, sockfd, level, optname int32, tv *unix.Timeval) (int32, error) { + t.Helper() + timeval := pb.Timeval{ Seconds: int64(tv.Sec), Microseconds: int64(tv.Usec), } - return dut.setSockOpt(ctx, sockfd, level, optname, &pb.SockOptVal{Val: &pb.SockOptVal_Timeval{&timeval}}) + return dut.setSockOpt(ctx, t, sockfd, level, optname, &pb.SockOptVal{Val: &pb.SockOptVal_Timeval{&timeval}}) } // Socket calls socket on the DUT and returns the file descriptor. If socket // fails on the DUT, the test ends. -func (dut *DUT) Socket(domain, typ, proto int32) int32 { - dut.t.Helper() - fd, err := dut.SocketWithErrno(domain, typ, proto) +func (dut *DUT) Socket(t *testing.T, domain, typ, proto int32) int32 { + t.Helper() + + fd, err := dut.SocketWithErrno(t, domain, typ, proto) if fd < 0 { - dut.t.Fatalf("failed to create socket: %s", err) + t.Fatalf("failed to create socket: %s", err) } return fd } // SocketWithErrno calls socket on the DUT and returns the fd and errno. -func (dut *DUT) SocketWithErrno(domain, typ, proto int32) (int32, error) { - dut.t.Helper() +func (dut *DUT) SocketWithErrno(t *testing.T, domain, typ, proto int32) (int32, error) { + t.Helper() + req := pb.SocketRequest{ Domain: domain, Type: typ, @@ -625,7 +665,7 @@ func (dut *DUT) SocketWithErrno(domain, typ, proto int32) (int32, error) { ctx := context.Background() resp, err := dut.posixServer.Socket(ctx, &req) if err != nil { - dut.t.Fatalf("failed to call Socket: %s", err) + t.Fatalf("failed to call Socket: %s", err) } return resp.GetFd(), syscall.Errno(resp.GetErrno_()) } @@ -633,20 +673,22 @@ func (dut *DUT) SocketWithErrno(domain, typ, proto int32) (int32, error) { // Recv calls recv on the DUT and causes a fatal test failure if it doesn't // succeed. If more control over the timeout or error handling is needed, use // RecvWithErrno. -func (dut *DUT) Recv(sockfd, len, flags int32) []byte { - dut.t.Helper() +func (dut *DUT) Recv(t *testing.T, sockfd, len, flags int32) []byte { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) defer cancel() - ret, buf, err := dut.RecvWithErrno(ctx, sockfd, len, flags) + ret, buf, err := dut.RecvWithErrno(ctx, t, sockfd, len, flags) if ret == -1 { - dut.t.Fatalf("failed to recv: %s", err) + t.Fatalf("failed to recv: %s", err) } return buf } // RecvWithErrno calls recv on the DUT. -func (dut *DUT) RecvWithErrno(ctx context.Context, sockfd, len, flags int32) (int32, []byte, error) { - dut.t.Helper() +func (dut *DUT) RecvWithErrno(ctx context.Context, t *testing.T, sockfd, len, flags int32) (int32, []byte, error) { + t.Helper() + req := pb.RecvRequest{ Sockfd: sockfd, Len: len, @@ -654,7 +696,7 @@ func (dut *DUT) RecvWithErrno(ctx context.Context, sockfd, len, flags int32) (in } resp, err := dut.posixServer.Recv(ctx, &req) if err != nil { - dut.t.Fatalf("failed to call Recv: %s", err) + t.Fatalf("failed to call Recv: %s", err) } return resp.GetRet(), resp.GetBuf(), syscall.Errno(resp.GetErrno_()) } diff --git a/test/packetimpact/testbench/layers.go b/test/packetimpact/testbench/layers.go index 645f6c1a9..24aa46cce 100644 --- a/test/packetimpact/testbench/layers.go +++ b/test/packetimpact/testbench/layers.go @@ -805,7 +805,11 @@ func (l *ICMPv6) ToBytes() ([]byte, error) { // We need to search forward to find the IPv6 header. for prev := l.Prev(); prev != nil; prev = prev.Prev() { if ipv6, ok := prev.(*IPv6); ok { - h.SetChecksum(header.ICMPv6Checksum(h, *ipv6.SrcAddr, *ipv6.DstAddr, buffer.VectorisedView{})) + payload, err := payload(l) + if err != nil { + return nil, err + } + h.SetChecksum(header.ICMPv6Checksum(h, *ipv6.SrcAddr, *ipv6.DstAddr, payload)) break } } diff --git a/test/packetimpact/testbench/rawsockets.go b/test/packetimpact/testbench/rawsockets.go index 278229b7e..57e822725 100644 --- a/test/packetimpact/testbench/rawsockets.go +++ b/test/packetimpact/testbench/rawsockets.go @@ -28,7 +28,6 @@ import ( // Sniffer can sniff raw packets on the wire. type Sniffer struct { - t *testing.T fd int } @@ -40,6 +39,8 @@ func htons(x uint16) uint16 { // NewSniffer creates a Sniffer connected to *device. func NewSniffer(t *testing.T) (Sniffer, error) { + t.Helper() + snifferFd, err := unix.Socket(unix.AF_PACKET, unix.SOCK_RAW, int(htons(unix.ETH_P_ALL))) if err != nil { return Sniffer{}, err @@ -51,7 +52,6 @@ func NewSniffer(t *testing.T) (Sniffer, error) { t.Fatalf("can't setsockopt SO_RCVBUF to 10M: %s", err) } return Sniffer{ - t: t, fd: snifferFd, }, nil } @@ -61,7 +61,9 @@ func NewSniffer(t *testing.T) (Sniffer, error) { const maxReadSize int = 65536 // Recv tries to read one frame until the timeout is up. -func (s *Sniffer) Recv(timeout time.Duration) []byte { +func (s *Sniffer) Recv(t *testing.T, timeout time.Duration) []byte { + t.Helper() + deadline := time.Now().Add(timeout) for { timeout = deadline.Sub(time.Now()) @@ -75,7 +77,7 @@ func (s *Sniffer) Recv(timeout time.Duration) []byte { } if err := unix.SetsockoptTimeval(s.fd, unix.SOL_SOCKET, unix.SO_RCVTIMEO, &tv); err != nil { - s.t.Fatalf("can't setsockopt SO_RCVTIMEO: %s", err) + t.Fatalf("can't setsockopt SO_RCVTIMEO: %s", err) } buf := make([]byte, maxReadSize) @@ -85,10 +87,10 @@ func (s *Sniffer) Recv(timeout time.Duration) []byte { continue } if err != nil { - s.t.Fatalf("can't read: %s", err) + t.Fatalf("can't read: %s", err) } if nread > maxReadSize { - s.t.Fatalf("received a truncated frame of %d bytes", nread) + t.Fatalf("received a truncated frame of %d bytes, want at most %d bytes", nread, maxReadSize) } return buf[:nread] } @@ -96,14 +98,16 @@ func (s *Sniffer) Recv(timeout time.Duration) []byte { // Drain drains the Sniffer's socket receive buffer by receiving until there's // nothing else to receive. -func (s *Sniffer) Drain() { - s.t.Helper() +func (s *Sniffer) Drain(t *testing.T) { + t.Helper() + flags, err := unix.FcntlInt(uintptr(s.fd), unix.F_GETFL, 0) if err != nil { - s.t.Fatalf("failed to get sniffer socket fd flags: %s", err) + t.Fatalf("failed to get sniffer socket fd flags: %s", err) } - if _, err := unix.FcntlInt(uintptr(s.fd), unix.F_SETFL, flags|unix.O_NONBLOCK); err != nil { - s.t.Fatalf("failed to make sniffer socket non-blocking: %s", err) + nonBlockingFlags := flags | unix.O_NONBLOCK + if _, err := unix.FcntlInt(uintptr(s.fd), unix.F_SETFL, nonBlockingFlags); err != nil { + t.Fatalf("failed to make sniffer socket non-blocking with flags %b: %s", nonBlockingFlags, err) } for { buf := make([]byte, maxReadSize) @@ -113,7 +117,7 @@ func (s *Sniffer) Drain() { } } if _, err := unix.FcntlInt(uintptr(s.fd), unix.F_SETFL, flags); err != nil { - s.t.Fatalf("failed to restore sniffer socket fd flags: %s", err) + t.Fatalf("failed to restore sniffer socket fd flags to %b: %s", flags, err) } } @@ -128,12 +132,13 @@ func (s *Sniffer) close() error { // Injector can inject raw frames. type Injector struct { - t *testing.T fd int } // NewInjector creates a new injector on *device. func NewInjector(t *testing.T) (Injector, error) { + t.Helper() + ifInfo, err := net.InterfaceByName(Device) if err != nil { return Injector{}, err @@ -156,15 +161,20 @@ func NewInjector(t *testing.T) (Injector, error) { return Injector{}, err } return Injector{ - t: t, fd: injectFd, }, nil } // Send a raw frame. -func (i *Injector) Send(b []byte) { - if _, err := unix.Write(i.fd, b); err != nil { - i.t.Fatalf("can't write: %s of len %d", err, len(b)) +func (i *Injector) Send(t *testing.T, b []byte) { + t.Helper() + + n, err := unix.Write(i.fd, b) + if err != nil { + t.Fatalf("can't write bytes of len %d: %s", len(b), err) + } + if n != len(b) { + t.Fatalf("got %d bytes written, want %d", n, len(b)) } } diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD index 6a07889be..27905dcff 100644 --- a/test/packetimpact/tests/BUILD +++ b/test/packetimpact/tests/BUILD @@ -220,6 +220,16 @@ packetimpact_go_test( ) packetimpact_go_test( + name = "tcp_network_unreachable", + srcs = ["tcp_network_unreachable_test.go"], + deps = [ + "//pkg/tcpip/header", + "//test/packetimpact/testbench", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +packetimpact_go_test( name = "tcp_cork_mss", srcs = ["tcp_cork_mss_test.go"], deps = [ diff --git a/test/packetimpact/tests/fin_wait2_timeout_test.go b/test/packetimpact/tests/fin_wait2_timeout_test.go index 407565078..a61054c2c 100644 --- a/test/packetimpact/tests/fin_wait2_timeout_test.go +++ b/test/packetimpact/tests/fin_wait2_timeout_test.go @@ -39,34 +39,34 @@ func TestFinWait2Timeout(t *testing.T) { t.Run(tt.description, func(t *testing.T) { dut := testbench.NewDUT(t) defer dut.TearDown() - listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) - defer dut.Close(listenFd) + listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + defer dut.Close(t, listenFd) conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) - defer conn.Close() - conn.Connect() + defer conn.Close(t) + conn.Connect(t) - acceptFd, _ := dut.Accept(listenFd) + acceptFd, _ := dut.Accept(t, listenFd) if tt.linger2 { tv := unix.Timeval{Sec: 1, Usec: 0} - dut.SetSockOptTimeval(acceptFd, unix.SOL_TCP, unix.TCP_LINGER2, &tv) + dut.SetSockOptTimeval(t, acceptFd, unix.SOL_TCP, unix.TCP_LINGER2, &tv) } - dut.Close(acceptFd) + dut.Close(t, acceptFd) - if _, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); err != nil { + if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); err != nil { t.Fatalf("expected a FIN-ACK within 1 second but got none: %s", err) } - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) time.Sleep(5 * time.Second) - conn.Drain() + conn.Drain(t) - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) if tt.linger2 { - if _, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, time.Second); err != nil { + if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, time.Second); err != nil { t.Fatalf("expected a RST packet within a second but got none: %s", err) } } else { - if got, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, 10*time.Second); got != nil || err == nil { + if got, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, 10*time.Second); got != nil || err == nil { t.Fatalf("expected no RST packets within ten seconds but got one: %s", got) } } diff --git a/test/packetimpact/tests/icmpv6_param_problem_test.go b/test/packetimpact/tests/icmpv6_param_problem_test.go index 8dfd26ee8..2d59d552d 100644 --- a/test/packetimpact/tests/icmpv6_param_problem_test.go +++ b/test/packetimpact/tests/icmpv6_param_problem_test.go @@ -34,7 +34,7 @@ func TestICMPv6ParamProblemTest(t *testing.T) { dut := testbench.NewDUT(t) defer dut.TearDown() conn := testbench.NewIPv6Conn(t, testbench.IPv6{}, testbench.IPv6{}) - defer conn.Close() + defer conn.Close(t) ipv6 := testbench.IPv6{ // 254 is reserved and used for experimentation and testing. This should // cause an error. @@ -45,8 +45,8 @@ func TestICMPv6ParamProblemTest(t *testing.T) { Payload: []byte("hello world"), } - toSend := (*testbench.Connection)(&conn).CreateFrame(testbench.Layers{&ipv6}, &icmpv6) - (*testbench.Connection)(&conn).SendFrame(toSend) + toSend := (*testbench.Connection)(&conn).CreateFrame(t, testbench.Layers{&ipv6}, &icmpv6) + (*testbench.Connection)(&conn).SendFrame(t, toSend) // Build the expected ICMPv6 payload, which includes an index to the // problematic byte and also the problematic packet as described in @@ -72,7 +72,7 @@ func TestICMPv6ParamProblemTest(t *testing.T) { &expectedICMPv6, } timeout := time.Second - if _, err := conn.ExpectFrame(paramProblem, timeout); err != nil { + if _, err := conn.ExpectFrame(t, paramProblem, timeout); err != nil { t.Errorf("expected %s within %s but got none: %s", paramProblem, timeout, err) } } diff --git a/test/packetimpact/tests/ipv4_id_uniqueness_test.go b/test/packetimpact/tests/ipv4_id_uniqueness_test.go index 70f6df5e0..cf881418c 100644 --- a/test/packetimpact/tests/ipv4_id_uniqueness_test.go +++ b/test/packetimpact/tests/ipv4_id_uniqueness_test.go @@ -31,8 +31,8 @@ func init() { testbench.RegisterFlags(flag.CommandLine) } -func recvTCPSegment(conn *testbench.TCPIPv4, expect *testbench.TCP, expectPayload *testbench.Payload) (uint16, error) { - layers, err := conn.ExpectData(expect, expectPayload, time.Second) +func recvTCPSegment(t *testing.T, conn *testbench.TCPIPv4, expect *testbench.TCP, expectPayload *testbench.Payload) (uint16, error) { + layers, err := conn.ExpectData(t, expect, expectPayload, time.Second) if err != nil { return 0, fmt.Errorf("failed to receive TCP segment: %s", err) } @@ -69,17 +69,17 @@ func TestIPv4RetransmitIdentificationUniqueness(t *testing.T) { dut := testbench.NewDUT(t) defer dut.TearDown() - listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) - defer dut.Close(listenFD) + listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + defer dut.Close(t, listenFD) conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) - defer conn.Close() + defer conn.Close(t) - conn.Connect() - remoteFD, _ := dut.Accept(listenFD) - defer dut.Close(remoteFD) + conn.Connect(t) + remoteFD, _ := dut.Accept(t, listenFD) + defer dut.Close(t, remoteFD) - dut.SetSockOptInt(remoteFD, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1) + dut.SetSockOptInt(t, remoteFD, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1) // TODO(b/129291778) The following socket option clears the DF bit on // IP packets sent over the socket, and is currently not supported by @@ -87,30 +87,30 @@ func TestIPv4RetransmitIdentificationUniqueness(t *testing.T) { // socket option being not supported does not affect the operation of // this test. Once the socket option is supported, the following call // can be changed to simply assert success. - ret, errno := dut.SetSockOptIntWithErrno(context.Background(), remoteFD, unix.IPPROTO_IP, linux.IP_MTU_DISCOVER, linux.IP_PMTUDISC_DONT) + ret, errno := dut.SetSockOptIntWithErrno(context.Background(), t, remoteFD, unix.IPPROTO_IP, linux.IP_MTU_DISCOVER, linux.IP_PMTUDISC_DONT) if ret == -1 && errno != unix.ENOTSUP { t.Fatalf("failed to set IP_MTU_DISCOVER socket option to IP_PMTUDISC_DONT: %s", errno) } samplePayload := &testbench.Payload{Bytes: tc.payload} - dut.Send(remoteFD, tc.payload, 0) - if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil { + dut.Send(t, remoteFD, tc.payload, 0) + if _, err := conn.ExpectData(t, &testbench.TCP{}, samplePayload, time.Second); err != nil { t.Fatalf("failed to receive TCP segment sent for RTT calculation: %s", err) } // Let the DUT estimate RTO with RTT from the DATA-ACK. // TODO(gvisor.dev/issue/2685) Estimate RTO during handshake, after which // we can skip sending this ACK. - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) - dut.Send(remoteFD, tc.payload, 0) - expectTCP := &testbench.TCP{SeqNum: testbench.Uint32(uint32(*conn.RemoteSeqNum()))} - originalID, err := recvTCPSegment(&conn, expectTCP, samplePayload) + dut.Send(t, remoteFD, tc.payload, 0) + expectTCP := &testbench.TCP{SeqNum: testbench.Uint32(uint32(*conn.RemoteSeqNum(t)))} + originalID, err := recvTCPSegment(t, &conn, expectTCP, samplePayload) if err != nil { t.Fatalf("failed to receive TCP segment: %s", err) } - retransmitID, err := recvTCPSegment(&conn, expectTCP, samplePayload) + retransmitID, err := recvTCPSegment(t, &conn, expectTCP, samplePayload) if err != nil { t.Fatalf("failed to receive retransmitted TCP segment: %s", err) } diff --git a/test/packetimpact/tests/ipv6_fragment_reassembly_test.go b/test/packetimpact/tests/ipv6_fragment_reassembly_test.go index 7b462c8e2..b5f94ad4b 100644 --- a/test/packetimpact/tests/ipv6_fragment_reassembly_test.go +++ b/test/packetimpact/tests/ipv6_fragment_reassembly_test.go @@ -48,7 +48,7 @@ func TestIPv6FragmentReassembly(t *testing.T) { dut := testbench.NewDUT(t) defer dut.TearDown() conn := testbench.NewIPv6Conn(t, testbench.IPv6{}, testbench.IPv6{}) - defer conn.Close() + defer conn.Close(t) firstPayloadToSend := make([]byte, firstPayloadLength) for i := range firstPayloadToSend { @@ -81,7 +81,7 @@ func TestIPv6FragmentReassembly(t *testing.T) { buffer.NewVectorisedView(len(secondPayloadToSend), []buffer.View{secondPayloadToSend}), ) - conn.Send(testbench.IPv6{}, + conn.Send(t, testbench.IPv6{}, &testbench.IPv6FragmentExtHdr{ FragmentOffset: testbench.Uint16(0), MoreFragments: testbench.Bool(true), @@ -96,7 +96,7 @@ func TestIPv6FragmentReassembly(t *testing.T) { icmpv6ProtoNum := header.IPv6ExtensionHeaderIdentifier(header.ICMPv6ProtocolNumber) - conn.Send(testbench.IPv6{}, + conn.Send(t, testbench.IPv6{}, &testbench.IPv6FragmentExtHdr{ NextHeader: &icmpv6ProtoNum, FragmentOffset: testbench.Uint16((firstPayloadLength + header.ICMPv6EchoMinimumSize) / 8), @@ -107,7 +107,7 @@ func TestIPv6FragmentReassembly(t *testing.T) { Bytes: secondPayloadToSend, }) - gotEchoReplyFirstPart, err := conn.ExpectFrame(testbench.Layers{ + gotEchoReplyFirstPart, err := conn.ExpectFrame(t, testbench.Layers{ &testbench.Ether{}, &testbench.IPv6{}, &testbench.IPv6FragmentExtHdr{ @@ -142,7 +142,7 @@ func TestIPv6FragmentReassembly(t *testing.T) { hex.Dump(wantFirstPayload)) } - gotEchoReplySecondPart, err := conn.ExpectFrame(testbench.Layers{ + gotEchoReplySecondPart, err := conn.ExpectFrame(t, testbench.Layers{ &testbench.Ether{}, &testbench.IPv6{}, &testbench.IPv6FragmentExtHdr{ diff --git a/test/packetimpact/tests/ipv6_unknown_options_action_test.go b/test/packetimpact/tests/ipv6_unknown_options_action_test.go index 100b30ad7..d7d63cbd2 100644 --- a/test/packetimpact/tests/ipv6_unknown_options_action_test.go +++ b/test/packetimpact/tests/ipv6_unknown_options_action_test.go @@ -23,21 +23,21 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" - tb "gvisor.dev/gvisor/test/packetimpact/testbench" + "gvisor.dev/gvisor/test/packetimpact/testbench" ) func init() { - tb.RegisterFlags(flag.CommandLine) + testbench.RegisterFlags(flag.CommandLine) } -func mkHopByHopOptionsExtHdr(optType byte) tb.Layer { - return &tb.IPv6HopByHopOptionsExtHdr{ +func mkHopByHopOptionsExtHdr(optType byte) testbench.Layer { + return &testbench.IPv6HopByHopOptionsExtHdr{ Options: []byte{optType, 0x04, 0x00, 0x00, 0x00, 0x00}, } } -func mkDestinationOptionsExtHdr(optType byte) tb.Layer { - return &tb.IPv6DestinationOptionsExtHdr{ +func mkDestinationOptionsExtHdr(optType byte) testbench.Layer { + return &testbench.IPv6DestinationOptionsExtHdr{ Options: []byte{optType, 0x04, 0x00, 0x00, 0x00, 0x00}, } } @@ -49,7 +49,7 @@ func optionTypeFromAction(action header.IPv6OptionUnknownAction) byte { func TestIPv6UnknownOptionAction(t *testing.T) { for _, tt := range []struct { description string - mkExtHdr func(optType byte) tb.Layer + mkExtHdr func(optType byte) testbench.Layer action header.IPv6OptionUnknownAction multicastDst bool wantICMPv6 bool @@ -140,21 +140,21 @@ func TestIPv6UnknownOptionAction(t *testing.T) { }, } { t.Run(tt.description, func(t *testing.T) { - dut := tb.NewDUT(t) + dut := testbench.NewDUT(t) defer dut.TearDown() - ipv6Conn := tb.NewIPv6Conn(t, tb.IPv6{}, tb.IPv6{}) - conn := (*tb.Connection)(&ipv6Conn) - defer ipv6Conn.Close() + ipv6Conn := testbench.NewIPv6Conn(t, testbench.IPv6{}, testbench.IPv6{}) + conn := (*testbench.Connection)(&ipv6Conn) + defer ipv6Conn.Close(t) - outgoingOverride := tb.Layers{} + outgoingOverride := testbench.Layers{} if tt.multicastDst { - outgoingOverride = tb.Layers{&tb.IPv6{ - DstAddr: tb.Address(tcpip.Address(net.ParseIP("ff02::1"))), + outgoingOverride = testbench.Layers{&testbench.IPv6{ + DstAddr: testbench.Address(tcpip.Address(net.ParseIP("ff02::1"))), }} } - outgoing := conn.CreateFrame(outgoingOverride, tt.mkExtHdr(optionTypeFromAction(tt.action))) - conn.SendFrame(outgoing) + outgoing := conn.CreateFrame(t, outgoingOverride, tt.mkExtHdr(optionTypeFromAction(tt.action))) + conn.SendFrame(t, outgoing) ipv6Sent := outgoing[1:] invokingPacket, err := ipv6Sent.ToBytes() if err != nil { @@ -167,12 +167,12 @@ func TestIPv6UnknownOptionAction(t *testing.T) { // after the IPv6 header (after NextHeader and ExtHdrLen). binary.BigEndian.PutUint32(icmpv6Payload, header.IPv6MinimumSize+2) icmpv6Payload = append(icmpv6Payload, invokingPacket...) - gotICMPv6, err := ipv6Conn.ExpectFrame(tb.Layers{ - &tb.Ether{}, - &tb.IPv6{}, - &tb.ICMPv6{ - Type: tb.ICMPv6Type(header.ICMPv6ParamProblem), - Code: tb.Byte(2), + gotICMPv6, err := ipv6Conn.ExpectFrame(t, testbench.Layers{ + &testbench.Ether{}, + &testbench.IPv6{}, + &testbench.ICMPv6{ + Type: testbench.ICMPv6Type(header.ICMPv6ParamProblem), + Code: testbench.Byte(2), Payload: icmpv6Payload, }, }, time.Second) diff --git a/test/packetimpact/tests/tcp_close_wait_ack_test.go b/test/packetimpact/tests/tcp_close_wait_ack_test.go index 6e7ff41d7..e6a96f214 100644 --- a/test/packetimpact/tests/tcp_close_wait_ack_test.go +++ b/test/packetimpact/tests/tcp_close_wait_ack_test.go @@ -33,39 +33,39 @@ func init() { func TestCloseWaitAck(t *testing.T) { for _, tt := range []struct { description string - makeTestingTCP func(conn *testbench.TCPIPv4, seqNumOffset seqnum.Size, windowSize seqnum.Size) testbench.TCP + makeTestingTCP func(t *testing.T, conn *testbench.TCPIPv4, seqNumOffset, windowSize seqnum.Size) testbench.TCP seqNumOffset seqnum.Size expectAck bool }{ - {"OTW", GenerateOTWSeqSegment, 0, false}, - {"OTW", GenerateOTWSeqSegment, 1, true}, - {"OTW", GenerateOTWSeqSegment, 2, true}, - {"ACK", GenerateUnaccACKSegment, 0, false}, - {"ACK", GenerateUnaccACKSegment, 1, true}, - {"ACK", GenerateUnaccACKSegment, 2, true}, + {"OTW", generateOTWSeqSegment, 0, false}, + {"OTW", generateOTWSeqSegment, 1, true}, + {"OTW", generateOTWSeqSegment, 2, true}, + {"ACK", generateUnaccACKSegment, 0, false}, + {"ACK", generateUnaccACKSegment, 1, true}, + {"ACK", generateUnaccACKSegment, 2, true}, } { t.Run(fmt.Sprintf("%s%d", tt.description, tt.seqNumOffset), func(t *testing.T) { dut := testbench.NewDUT(t) defer dut.TearDown() - listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) - defer dut.Close(listenFd) + listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + defer dut.Close(t, listenFd) conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) - defer conn.Close() + defer conn.Close(t) - conn.Connect() - acceptFd, _ := dut.Accept(listenFd) + conn.Connect(t) + acceptFd, _ := dut.Accept(t, listenFd) // Send a FIN to DUT to intiate the active close - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagFin)}) - gotTCP, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagFin)}) + gotTCP, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second) if err != nil { t.Fatalf("expected an ACK for our fin and DUT should enter CLOSE_WAIT: %s", err) } windowSize := seqnum.Size(*gotTCP.WindowSize) // Send a segment with OTW Seq / unacc ACK and expect an ACK back - conn.Send(tt.makeTestingTCP(&conn, tt.seqNumOffset, windowSize), &testbench.Payload{Bytes: []byte("Sample Data")}) - gotAck, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second) + conn.Send(t, tt.makeTestingTCP(t, &conn, tt.seqNumOffset, windowSize), &testbench.Payload{Bytes: []byte("Sample Data")}) + gotAck, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second) if tt.expectAck && err != nil { t.Fatalf("expected an ack but got none: %s", err) } @@ -74,35 +74,36 @@ func TestCloseWaitAck(t *testing.T) { } // Now let's verify DUT is indeed in CLOSE_WAIT - dut.Close(acceptFd) - if _, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagFin)}, time.Second); err != nil { + dut.Close(t, acceptFd) + if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagFin)}, time.Second); err != nil { t.Fatalf("expected DUT to send a FIN: %s", err) } // Ack the FIN from DUT - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) // Send some extra data to DUT - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, &testbench.Payload{Bytes: []byte("Sample Data")}) - if _, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, time.Second); err != nil { + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, &testbench.Payload{Bytes: []byte("Sample Data")}) + if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, time.Second); err != nil { t.Fatalf("expected DUT to send an RST: %s", err) } }) } } -// This generates an segment with seqnum = RCV.NXT + RCV.WND + seqNumOffset, the -// generated segment is only acceptable when seqNumOffset is 0, otherwise an ACK -// is expected from the receiver. -func GenerateOTWSeqSegment(conn *testbench.TCPIPv4, seqNumOffset seqnum.Size, windowSize seqnum.Size) testbench.TCP { - lastAcceptable := conn.LocalSeqNum().Add(windowSize) +// generateOTWSeqSegment generates an segment with +// seqnum = RCV.NXT + RCV.WND + seqNumOffset, the generated segment is only +// acceptable when seqNumOffset is 0, otherwise an ACK is expected from the +// receiver. +func generateOTWSeqSegment(t *testing.T, conn *testbench.TCPIPv4, seqNumOffset seqnum.Size, windowSize seqnum.Size) testbench.TCP { + lastAcceptable := conn.LocalSeqNum(t).Add(windowSize) otwSeq := uint32(lastAcceptable.Add(seqNumOffset)) return testbench.TCP{SeqNum: testbench.Uint32(otwSeq), Flags: testbench.Uint8(header.TCPFlagAck)} } -// This generates an segment with acknum = SND.NXT + seqNumOffset, the generated -// segment is only acceptable when seqNumOffset is 0, otherwise an ACK is -// expected from the receiver. -func GenerateUnaccACKSegment(conn *testbench.TCPIPv4, seqNumOffset seqnum.Size, windowSize seqnum.Size) testbench.TCP { - lastAcceptable := conn.RemoteSeqNum() +// generateUnaccACKSegment generates an segment with +// acknum = SND.NXT + seqNumOffset, the generated segment is only acceptable +// when seqNumOffset is 0, otherwise an ACK is expected from the receiver. +func generateUnaccACKSegment(t *testing.T, conn *testbench.TCPIPv4, seqNumOffset seqnum.Size, windowSize seqnum.Size) testbench.TCP { + lastAcceptable := conn.RemoteSeqNum(t) unaccAck := uint32(lastAcceptable.Add(seqNumOffset)) return testbench.TCP{AckNum: testbench.Uint32(unaccAck), Flags: testbench.Uint8(header.TCPFlagAck)} } diff --git a/test/packetimpact/tests/tcp_cork_mss_test.go b/test/packetimpact/tests/tcp_cork_mss_test.go index fb8f48629..8feea4a82 100644 --- a/test/packetimpact/tests/tcp_cork_mss_test.go +++ b/test/packetimpact/tests/tcp_cork_mss_test.go @@ -32,53 +32,53 @@ func init() { func TestTCPCorkMSS(t *testing.T) { dut := testbench.NewDUT(t) defer dut.TearDown() - listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) - defer dut.Close(listenFD) + listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + defer dut.Close(t, listenFD) conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) - defer conn.Close() + defer conn.Close(t) const mss = uint32(header.TCPDefaultMSS) options := make([]byte, header.TCPOptionMSSLength) header.EncodeMSSOption(mss, options) - conn.ConnectWithOptions(options) + conn.ConnectWithOptions(t, options) - acceptFD, _ := dut.Accept(listenFD) - defer dut.Close(acceptFD) + acceptFD, _ := dut.Accept(t, listenFD) + defer dut.Close(t, acceptFD) - dut.SetSockOptInt(acceptFD, unix.IPPROTO_TCP, unix.TCP_CORK, 1) + dut.SetSockOptInt(t, acceptFD, unix.IPPROTO_TCP, unix.TCP_CORK, 1) // Let the dut application send 2 small segments to be held up and coalesced // until the application sends a larger segment to fill up to > MSS. sampleData := []byte("Sample Data") - dut.Send(acceptFD, sampleData, 0) - dut.Send(acceptFD, sampleData, 0) + dut.Send(t, acceptFD, sampleData, 0) + dut.Send(t, acceptFD, sampleData, 0) expectedData := sampleData expectedData = append(expectedData, sampleData...) largeData := make([]byte, mss+1) expectedData = append(expectedData, largeData...) - dut.Send(acceptFD, largeData, 0) + dut.Send(t, acceptFD, largeData, 0) // Expect the segments to be coalesced and sent and capped to MSS. expectedPayload := testbench.Payload{Bytes: expectedData[:mss]} - if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, &expectedPayload, time.Second); err != nil { + if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, &expectedPayload, time.Second); err != nil { t.Fatalf("expected payload was not received: %s", err) } - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) // Expect the coalesced segment to be split and transmitted. expectedPayload = testbench.Payload{Bytes: expectedData[mss:]} - if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, &expectedPayload, time.Second); err != nil { + if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, &expectedPayload, time.Second); err != nil { t.Fatalf("expected payload was not received: %s", err) } // Check for segments to *not* be held up because of TCP_CORK when // the current send window is less than MSS. - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(uint16(2 * len(sampleData)))}) - dut.Send(acceptFD, sampleData, 0) - dut.Send(acceptFD, sampleData, 0) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(uint16(2 * len(sampleData)))}) + dut.Send(t, acceptFD, sampleData, 0) + dut.Send(t, acceptFD, sampleData, 0) expectedPayload = testbench.Payload{Bytes: append(sampleData, sampleData...)} - if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, &expectedPayload, time.Second); err != nil { + if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, &expectedPayload, time.Second); err != nil { t.Fatalf("expected payload was not received: %s", err) } - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) } diff --git a/test/packetimpact/tests/tcp_handshake_window_size_test.go b/test/packetimpact/tests/tcp_handshake_window_size_test.go index 652b530d0..22937d92f 100644 --- a/test/packetimpact/tests/tcp_handshake_window_size_test.go +++ b/test/packetimpact/tests/tcp_handshake_window_size_test.go @@ -33,14 +33,14 @@ func init() { func TestTCPHandshakeWindowSize(t *testing.T) { dut := testbench.NewDUT(t) defer dut.TearDown() - listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) - defer dut.Close(listenFD) + listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + defer dut.Close(t, listenFD) conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) - defer conn.Close() + defer conn.Close(t) // Start handshake with zero window size. - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn), WindowSize: testbench.Uint16(uint16(0))}) - if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err != nil { + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn), WindowSize: testbench.Uint16(uint16(0))}) + if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err != nil { t.Fatalf("expected SYN-ACK: %s", err) } // Update the advertised window size to a non-zero value with the ACK that @@ -48,10 +48,10 @@ func TestTCPHandshakeWindowSize(t *testing.T) { // // Set the window size with MSB set and expect the dut to treat it as // an unsigned value. - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(uint16(1 << 15))}) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(uint16(1 << 15))}) - acceptFd, _ := dut.Accept(listenFD) - defer dut.Close(acceptFd) + acceptFd, _ := dut.Accept(t, listenFD) + defer dut.Close(t, acceptFd) sampleData := []byte("Sample Data") samplePayload := &testbench.Payload{Bytes: sampleData} @@ -59,8 +59,8 @@ func TestTCPHandshakeWindowSize(t *testing.T) { // Since we advertised a zero window followed by a non-zero window, // expect the dut to honor the recently advertised non-zero window // and actually send out the data instead of probing for zero window. - dut.Send(acceptFd, sampleData, 0) - if _, err := conn.ExpectNextData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload, time.Second); err != nil { + dut.Send(t, acceptFd, sampleData, 0) + if _, err := conn.ExpectNextData(t, &testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload, time.Second); err != nil { t.Fatalf("expected payload was not received: %s", err) } } diff --git a/test/packetimpact/tests/tcp_network_unreachable_test.go b/test/packetimpact/tests/tcp_network_unreachable_test.go new file mode 100644 index 000000000..900352fa1 --- /dev/null +++ b/test/packetimpact/tests/tcp_network_unreachable_test.go @@ -0,0 +1,139 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp_synsent_reset_test + +import ( + "context" + "flag" + "net" + "syscall" + "testing" + "time" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/test/packetimpact/testbench" +) + +func init() { + testbench.RegisterFlags(flag.CommandLine) +} + +// TestTCPSynSentUnreachable verifies that TCP connections fail immediately when +// an ICMP destination unreachable message is sent in response to the inital +// SYN. +func TestTCPSynSentUnreachable(t *testing.T) { + // Create the DUT and connection. + dut := testbench.NewDUT(t) + defer dut.TearDown() + clientFD, clientPort := dut.CreateBoundSocket(t, unix.SOCK_STREAM|unix.SOCK_NONBLOCK, unix.IPPROTO_TCP, net.ParseIP(testbench.RemoteIPv4)) + port := uint16(9001) + conn := testbench.NewTCPIPv4(t, testbench.TCP{SrcPort: &port, DstPort: &clientPort}, testbench.TCP{SrcPort: &clientPort, DstPort: &port}) + defer conn.Close(t) + + // Bring the DUT to SYN-SENT state with a non-blocking connect. + ctx, cancel := context.WithTimeout(context.Background(), testbench.RPCTimeout) + defer cancel() + sa := unix.SockaddrInet4{Port: int(port)} + copy(sa.Addr[:], net.IP(net.ParseIP(testbench.LocalIPv4)).To4()) + if _, err := dut.ConnectWithErrno(ctx, t, clientFD, &sa); err != syscall.Errno(unix.EINPROGRESS) { + t.Errorf("expected connect to fail with EINPROGRESS, but got %v", err) + } + + // Get the SYN. + tcpLayers, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn)}, nil, time.Second) + if err != nil { + t.Fatalf("expected SYN: %s", err) + } + + // Send a host unreachable message. + rawConn := (*testbench.Connection)(&conn) + layers := rawConn.CreateFrame(t, nil) + layers = layers[:len(layers)-1] + const ipLayer = 1 + const tcpLayer = ipLayer + 1 + ip, ok := tcpLayers[ipLayer].(*testbench.IPv4) + if !ok { + t.Fatalf("expected %s to be IPv4", tcpLayers[ipLayer]) + } + tcp, ok := tcpLayers[tcpLayer].(*testbench.TCP) + if !ok { + t.Fatalf("expected %s to be TCP", tcpLayers[tcpLayer]) + } + var icmpv4 testbench.ICMPv4 = testbench.ICMPv4{Type: testbench.ICMPv4Type(header.ICMPv4DstUnreachable), Code: testbench.Uint8(header.ICMPv4HostUnreachable)} + layers = append(layers, &icmpv4, ip, tcp) + rawConn.SendFrameStateless(t, layers) + + if _, err = dut.ConnectWithErrno(ctx, t, clientFD, &sa); err != syscall.Errno(unix.EHOSTUNREACH) { + t.Errorf("expected connect to fail with EHOSTUNREACH, but got %v", err) + } +} + +// TestTCPSynSentUnreachable6 verifies that TCP connections fail immediately when +// an ICMP destination unreachable message is sent in response to the inital +// SYN. +func TestTCPSynSentUnreachable6(t *testing.T) { + // Create the DUT and connection. + dut := testbench.NewDUT(t) + defer dut.TearDown() + clientFD, clientPort := dut.CreateBoundSocket(t, unix.SOCK_STREAM|unix.SOCK_NONBLOCK, unix.IPPROTO_TCP, net.ParseIP(testbench.RemoteIPv6)) + conn := testbench.NewTCPIPv6(t, testbench.TCP{DstPort: &clientPort}, testbench.TCP{SrcPort: &clientPort}) + defer conn.Close(t) + + // Bring the DUT to SYN-SENT state with a non-blocking connect. + ctx, cancel := context.WithTimeout(context.Background(), testbench.RPCTimeout) + defer cancel() + sa := unix.SockaddrInet6{ + Port: int(conn.SrcPort()), + ZoneId: uint32(testbench.RemoteInterfaceID), + } + copy(sa.Addr[:], net.IP(net.ParseIP(testbench.LocalIPv6)).To16()) + if _, err := dut.ConnectWithErrno(ctx, t, clientFD, &sa); err != syscall.Errno(unix.EINPROGRESS) { + t.Errorf("expected connect to fail with EINPROGRESS, but got %v", err) + } + + // Get the SYN. + tcpLayers, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn)}, nil, time.Second) + if err != nil { + t.Fatalf("expected SYN: %s", err) + } + + // Send a host unreachable message. + rawConn := (*testbench.Connection)(&conn) + layers := rawConn.CreateFrame(t, nil) + layers = layers[:len(layers)-1] + const ipLayer = 1 + const tcpLayer = ipLayer + 1 + ip, ok := tcpLayers[ipLayer].(*testbench.IPv6) + if !ok { + t.Fatalf("expected %s to be IPv6", tcpLayers[ipLayer]) + } + tcp, ok := tcpLayers[tcpLayer].(*testbench.TCP) + if !ok { + t.Fatalf("expected %s to be TCP", tcpLayers[tcpLayer]) + } + var icmpv6 testbench.ICMPv6 = testbench.ICMPv6{ + Type: testbench.ICMPv6Type(header.ICMPv6DstUnreachable), + Code: testbench.Uint8(header.ICMPv6NetworkUnreachable), + // Per RFC 4443 3.1, the payload contains 4 zeroed bytes. + Payload: []byte{0, 0, 0, 0}, + } + layers = append(layers, &icmpv6, ip, tcp) + rawConn.SendFrameStateless(t, layers) + + if _, err = dut.ConnectWithErrno(ctx, t, clientFD, &sa); err != syscall.Errno(unix.ENETUNREACH) { + t.Errorf("expected connect to fail with ENETUNREACH, but got %v", err) + } +} diff --git a/test/packetimpact/tests/tcp_noaccept_close_rst_test.go b/test/packetimpact/tests/tcp_noaccept_close_rst_test.go index b9b3e91d3..82b7a85ff 100644 --- a/test/packetimpact/tests/tcp_noaccept_close_rst_test.go +++ b/test/packetimpact/tests/tcp_noaccept_close_rst_test.go @@ -31,12 +31,12 @@ func init() { func TestTcpNoAcceptCloseReset(t *testing.T) { dut := testbench.NewDUT(t) defer dut.TearDown() - listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) - conn.Connect() - defer conn.Close() - dut.Close(listenFd) - if _, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst | header.TCPFlagAck)}, 1*time.Second); err != nil { + conn.Connect(t) + defer conn.Close(t) + dut.Close(t, listenFd) + if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst | header.TCPFlagAck)}, 1*time.Second); err != nil { t.Fatalf("expected a RST-ACK packet but got none: %s", err) } } diff --git a/test/packetimpact/tests/tcp_outside_the_window_test.go b/test/packetimpact/tests/tcp_outside_the_window_test.go index ad8c74234..08f759f7c 100644 --- a/test/packetimpact/tests/tcp_outside_the_window_test.go +++ b/test/packetimpact/tests/tcp_outside_the_window_test.go @@ -63,25 +63,25 @@ func TestTCPOutsideTheWindow(t *testing.T) { t.Run(fmt.Sprintf("%s%d", tt.description, tt.seqNumOffset), func(t *testing.T) { dut := testbench.NewDUT(t) defer dut.TearDown() - listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) - defer dut.Close(listenFD) + listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + defer dut.Close(t, listenFD) conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) - defer conn.Close() - conn.Connect() - acceptFD, _ := dut.Accept(listenFD) - defer dut.Close(acceptFD) + defer conn.Close(t) + conn.Connect(t) + acceptFD, _ := dut.Accept(t, listenFD) + defer dut.Close(t, acceptFD) - windowSize := seqnum.Size(*conn.SynAck().WindowSize) + tt.seqNumOffset - conn.Drain() + windowSize := seqnum.Size(*conn.SynAck(t).WindowSize) + tt.seqNumOffset + conn.Drain(t) // Ignore whatever incrementing that this out-of-order packet might cause // to the AckNum. - localSeqNum := testbench.Uint32(uint32(*conn.LocalSeqNum())) - conn.Send(testbench.TCP{ + localSeqNum := testbench.Uint32(uint32(*conn.LocalSeqNum(t))) + conn.Send(t, testbench.TCP{ Flags: testbench.Uint8(tt.tcpFlags), - SeqNum: testbench.Uint32(uint32(conn.LocalSeqNum().Add(windowSize))), + SeqNum: testbench.Uint32(uint32(conn.LocalSeqNum(t).Add(windowSize))), }, tt.payload...) timeout := 3 * time.Second - gotACK, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), AckNum: localSeqNum}, timeout) + gotACK, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), AckNum: localSeqNum}, timeout) if tt.expectACK && err != nil { t.Fatalf("expected an ACK packet within %s but got none: %s", timeout, err) } diff --git a/test/packetimpact/tests/tcp_paws_mechanism_test.go b/test/packetimpact/tests/tcp_paws_mechanism_test.go index 55db4ece6..37f3b56dd 100644 --- a/test/packetimpact/tests/tcp_paws_mechanism_test.go +++ b/test/packetimpact/tests/tcp_paws_mechanism_test.go @@ -32,15 +32,15 @@ func init() { func TestPAWSMechanism(t *testing.T) { dut := testbench.NewDUT(t) defer dut.TearDown() - listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) - defer dut.Close(listenFD) + listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + defer dut.Close(t, listenFD) conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) - defer conn.Close() + defer conn.Close(t) options := make([]byte, header.TCPOptionTSLength) header.EncodeTSOption(currentTS(), 0, options) - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn), Options: options}) - synAck, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, time.Second) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn), Options: options}) + synAck, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, time.Second) if err != nil { t.Fatalf("didn't get synack during handshake: %s", err) } @@ -50,9 +50,9 @@ func TestPAWSMechanism(t *testing.T) { } tsecr := parsedSynOpts.TSVal header.EncodeTSOption(currentTS(), tsecr, options) - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), Options: options}) - acceptFD, _ := dut.Accept(listenFD) - defer dut.Close(acceptFD) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), Options: options}) + acceptFD, _ := dut.Accept(t, listenFD) + defer dut.Close(t, acceptFD) sampleData := []byte("Sample Data") sentTSVal := currentTS() @@ -61,9 +61,9 @@ func TestPAWSMechanism(t *testing.T) { // every time we send one, it should not cause any flakiness because timestamps // only need to be non-decreasing. time.Sleep(3 * time.Millisecond) - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), Options: options}, &testbench.Payload{Bytes: sampleData}) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), Options: options}, &testbench.Payload{Bytes: sampleData}) - gotTCP, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second) + gotTCP, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second) if err != nil { t.Fatalf("expected an ACK but got none: %s", err) } @@ -86,9 +86,9 @@ func TestPAWSMechanism(t *testing.T) { // 3ms here is chosen arbitrarily and this time.Sleep() should not cause flakiness // due to the exact same reasoning discussed above. time.Sleep(3 * time.Millisecond) - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), Options: options}, &testbench.Payload{Bytes: sampleData}) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), Options: options}, &testbench.Payload{Bytes: sampleData}) - gotTCP, err = conn.Expect(testbench.TCP{AckNum: lastAckNum, Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second) + gotTCP, err = conn.Expect(t, testbench.TCP{AckNum: lastAckNum, Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second) if err != nil { t.Fatalf("expected segment with AckNum %d but got none: %s", lastAckNum, err) } diff --git a/test/packetimpact/tests/tcp_queue_receive_in_syn_sent_test.go b/test/packetimpact/tests/tcp_queue_receive_in_syn_sent_test.go index 8fbec893b..d9f3ea0f2 100644 --- a/test/packetimpact/tests/tcp_queue_receive_in_syn_sent_test.go +++ b/test/packetimpact/tests/tcp_queue_receive_in_syn_sent_test.go @@ -52,26 +52,26 @@ func TestQueueReceiveInSynSent(t *testing.T) { dut := testbench.NewDUT(t) defer dut.TearDown() - socket, remotePort := dut.CreateBoundSocket(unix.SOCK_STREAM, unix.IPPROTO_TCP, net.ParseIP(testbench.RemoteIPv4)) + socket, remotePort := dut.CreateBoundSocket(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, net.ParseIP(testbench.RemoteIPv4)) conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) - defer conn.Close() + defer conn.Close(t) sampleData := []byte("Sample Data") - dut.SetNonBlocking(socket, true) - if _, err := dut.ConnectWithErrno(context.Background(), socket, conn.LocalAddr()); !errors.Is(err, syscall.EINPROGRESS) { + dut.SetNonBlocking(t, socket, true) + if _, err := dut.ConnectWithErrno(context.Background(), t, socket, conn.LocalAddr(t)); !errors.Is(err, syscall.EINPROGRESS) { t.Fatalf("failed to bring DUT to SYN-SENT, got: %s, want EINPROGRESS", err) } - if _, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn)}, time.Second); err != nil { + if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn)}, time.Second); err != nil { t.Fatalf("expected a SYN from DUT, but got none: %s", err) } - if _, _, err := dut.RecvWithErrno(context.Background(), socket, int32(len(sampleData)), 0); err != syscall.Errno(unix.EWOULDBLOCK) { + if _, _, err := dut.RecvWithErrno(context.Background(), t, socket, int32(len(sampleData)), 0); err != syscall.Errno(unix.EWOULDBLOCK) { t.Fatalf("expected error %s, got %s", syscall.Errno(unix.EWOULDBLOCK), err) } // Test blocking read. - dut.SetNonBlocking(socket, false) + dut.SetNonBlocking(t, socket, false) var wg sync.WaitGroup defer wg.Wait() @@ -86,7 +86,7 @@ func TestQueueReceiveInSynSent(t *testing.T) { block.Done() // Issue RECEIVE call in SYN-SENT, this should be queued for // process until the connection is established. - n, buff, err := dut.RecvWithErrno(ctx, socket, int32(len(sampleData)), 0) + n, buff, err := dut.RecvWithErrno(ctx, t, socket, int32(len(sampleData)), 0) if tt.reset { if err != syscall.Errno(unix.ECONNREFUSED) { t.Errorf("expected error %s, got %s", syscall.Errno(unix.ECONNREFUSED), err) @@ -112,19 +112,19 @@ func TestQueueReceiveInSynSent(t *testing.T) { time.Sleep(100 * time.Millisecond) if tt.reset { - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst | header.TCPFlagAck)}) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst | header.TCPFlagAck)}) return } // Bring the connection to Established. - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn | header.TCPFlagAck)}) - if _, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second); err != nil { + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn | header.TCPFlagAck)}) + if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second); err != nil { t.Fatalf("expected an ACK from DUT, but got none: %s", err) } // Send sample payload and expect an ACK. - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, &testbench.Payload{Bytes: sampleData}) - if _, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second); err != nil { + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, &testbench.Payload{Bytes: sampleData}) + if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second); err != nil { t.Fatalf("expected an ACK from DUT, but got none: %s", err) } }) diff --git a/test/packetimpact/tests/tcp_reordering_test.go b/test/packetimpact/tests/tcp_reordering_test.go index a5378a9dd..8742819ca 100644 --- a/test/packetimpact/tests/tcp_reordering_test.go +++ b/test/packetimpact/tests/tcp_reordering_test.go @@ -32,10 +32,10 @@ func init() { func TestReorderingWindow(t *testing.T) { dut := tb.NewDUT(t) defer dut.TearDown() - listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) - defer dut.Close(listenFd) + listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + defer dut.Close(t, listenFd) conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort}) - defer conn.Close() + defer conn.Close(t) // Enable SACK. opts := make([]byte, 40) @@ -49,17 +49,17 @@ func TestReorderingWindow(t *testing.T) { const mss = minMTU - header.IPv4MinimumSize - header.TCPMinimumSize optsOff += header.EncodeMSSOption(mss, opts[optsOff:]) - conn.ConnectWithOptions(opts[:optsOff]) + conn.ConnectWithOptions(t, opts[:optsOff]) - acceptFd, _ := dut.Accept(listenFd) - defer dut.Close(acceptFd) + acceptFd, _ := dut.Accept(t, listenFd) + defer dut.Close(t, acceptFd) if tb.DUTType == "linux" { // Linux has changed its handling of reordering, force the old behavior. - dut.SetSockOpt(acceptFd, unix.IPPROTO_TCP, unix.TCP_CONGESTION, []byte("reno")) + dut.SetSockOpt(t, acceptFd, unix.IPPROTO_TCP, unix.TCP_CONGESTION, []byte("reno")) } - pls := dut.GetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_MAXSEG) + pls := dut.GetSockOptInt(t, acceptFd, unix.IPPROTO_TCP, unix.TCP_MAXSEG) if tb.DUTType == "netstack" { // netstack does not impliment TCP_MAXSEG correctly. Fake it // here. Netstack uses the max SACK size which is 32. The MSS @@ -69,13 +69,13 @@ func TestReorderingWindow(t *testing.T) { payload := make([]byte, pls) - seqNum1 := *conn.RemoteSeqNum() + seqNum1 := *conn.RemoteSeqNum(t) const numPkts = 10 // Send some packets, checking that we receive each. for i, sn := 0, seqNum1; i < numPkts; i++ { - dut.Send(acceptFd, payload, 0) + dut.Send(t, acceptFd, payload, 0) - gotOne, err := conn.Expect(tb.TCP{SeqNum: tb.Uint32(uint32(sn))}, time.Second) + gotOne, err := conn.Expect(t, tb.TCP{SeqNum: tb.Uint32(uint32(sn))}, time.Second) sn.UpdateForward(seqnum.Size(len(payload))) if err != nil { t.Errorf("Expect #%d: %s", i+1, err) @@ -86,7 +86,7 @@ func TestReorderingWindow(t *testing.T) { } } - seqNum2 := *conn.RemoteSeqNum() + seqNum2 := *conn.RemoteSeqNum(t) // SACK packets #2-4. sackBlock := make([]byte, 40) @@ -97,13 +97,13 @@ func TestReorderingWindow(t *testing.T) { seqNum1.Add(seqnum.Size(len(payload))), seqNum1.Add(seqnum.Size(4 * len(payload))), }}, sackBlock[sbOff:]) - conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck), AckNum: tb.Uint32(uint32(seqNum1)), Options: sackBlock[:sbOff]}) + conn.Send(t, tb.TCP{Flags: tb.Uint8(header.TCPFlagAck), AckNum: tb.Uint32(uint32(seqNum1)), Options: sackBlock[:sbOff]}) // ACK first packet. - conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck), AckNum: tb.Uint32(uint32(seqNum1) + uint32(len(payload)))}) + conn.Send(t, tb.TCP{Flags: tb.Uint8(header.TCPFlagAck), AckNum: tb.Uint32(uint32(seqNum1) + uint32(len(payload)))}) // Check for retransmit. - gotOne, err := conn.Expect(tb.TCP{SeqNum: tb.Uint32(uint32(seqNum1))}, time.Second) + gotOne, err := conn.Expect(t, tb.TCP{SeqNum: tb.Uint32(uint32(seqNum1))}, time.Second) if err != nil { t.Error("Expect for retransmit:", err) } @@ -123,14 +123,14 @@ func TestReorderingWindow(t *testing.T) { seqNum1.Add(seqnum.Size(4 * len(payload))), }}, dsackBlock[dsbOff:]) - conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck), AckNum: tb.Uint32(uint32(seqNum2)), Options: dsackBlock[:dsbOff]}) + conn.Send(t, tb.TCP{Flags: tb.Uint8(header.TCPFlagAck), AckNum: tb.Uint32(uint32(seqNum2)), Options: dsackBlock[:dsbOff]}) // Send half of the original window of packets, checking that we // received each. for i, sn := 0, seqNum2; i < numPkts/2; i++ { - dut.Send(acceptFd, payload, 0) + dut.Send(t, acceptFd, payload, 0) - gotOne, err := conn.Expect(tb.TCP{SeqNum: tb.Uint32(uint32(sn))}, time.Second) + gotOne, err := conn.Expect(t, tb.TCP{SeqNum: tb.Uint32(uint32(sn))}, time.Second) sn.UpdateForward(seqnum.Size(len(payload))) if err != nil { t.Errorf("Expect #%d: %s", i+1, err) @@ -144,8 +144,8 @@ func TestReorderingWindow(t *testing.T) { if tb.DUTType == "netstack" { // The window should now be halved, so we should receive any // more, even if we send them. - dut.Send(acceptFd, payload, 0) - if got, err := conn.Expect(tb.TCP{}, 100*time.Millisecond); got != nil || err == nil { + dut.Send(t, acceptFd, payload, 0) + if got, err := conn.Expect(t, tb.TCP{}, 100*time.Millisecond); got != nil || err == nil { t.Fatalf("expected no packets within 100 millisecond, but got one: %s", got) } return @@ -153,9 +153,9 @@ func TestReorderingWindow(t *testing.T) { // Linux reduces the window by three. Check that we can receive the rest. for i, sn := 0, seqNum2.Add(seqnum.Size(numPkts/2*len(payload))); i < 2; i++ { - dut.Send(acceptFd, payload, 0) + dut.Send(t, acceptFd, payload, 0) - gotOne, err := conn.Expect(tb.TCP{SeqNum: tb.Uint32(uint32(sn))}, time.Second) + gotOne, err := conn.Expect(t, tb.TCP{SeqNum: tb.Uint32(uint32(sn))}, time.Second) sn.UpdateForward(seqnum.Size(len(payload))) if err != nil { t.Errorf("Expect #%d: %s", i+1, err) @@ -167,8 +167,8 @@ func TestReorderingWindow(t *testing.T) { } // The window should now be full. - dut.Send(acceptFd, payload, 0) - if got, err := conn.Expect(tb.TCP{}, 100*time.Millisecond); got != nil || err == nil { + dut.Send(t, acceptFd, payload, 0) + if got, err := conn.Expect(t, tb.TCP{}, 100*time.Millisecond); got != nil || err == nil { t.Fatalf("expected no packets within 100 millisecond, but got one: %s", got) } } diff --git a/test/packetimpact/tests/tcp_retransmits_test.go b/test/packetimpact/tests/tcp_retransmits_test.go index 6940eb7fb..072014ff8 100644 --- a/test/packetimpact/tests/tcp_retransmits_test.go +++ b/test/packetimpact/tests/tcp_retransmits_test.go @@ -33,41 +33,41 @@ func init() { func TestRetransmits(t *testing.T) { dut := testbench.NewDUT(t) defer dut.TearDown() - listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) - defer dut.Close(listenFd) + listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + defer dut.Close(t, listenFd) conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) - defer conn.Close() + defer conn.Close(t) - conn.Connect() - acceptFd, _ := dut.Accept(listenFd) - defer dut.Close(acceptFd) + conn.Connect(t) + acceptFd, _ := dut.Accept(t, listenFd) + defer dut.Close(t, acceptFd) - dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1) + dut.SetSockOptInt(t, acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1) sampleData := []byte("Sample Data") samplePayload := &testbench.Payload{Bytes: sampleData} - dut.Send(acceptFd, sampleData, 0) - if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil { + dut.Send(t, acceptFd, sampleData, 0) + if _, err := conn.ExpectData(t, &testbench.TCP{}, samplePayload, time.Second); err != nil { t.Fatalf("expected payload was not received: %s", err) } // Give a chance for the dut to estimate RTO with RTT from the DATA-ACK. // TODO(gvisor.dev/issue/2685) Estimate RTO during handshake, after which // we can skip sending this ACK. - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) startRTO := time.Second current := startRTO first := time.Now() - dut.Send(acceptFd, sampleData, 0) - seq := testbench.Uint32(uint32(*conn.RemoteSeqNum())) - if _, err := conn.ExpectData(&testbench.TCP{SeqNum: seq}, samplePayload, startRTO); err != nil { + dut.Send(t, acceptFd, sampleData, 0) + seq := testbench.Uint32(uint32(*conn.RemoteSeqNum(t))) + if _, err := conn.ExpectData(t, &testbench.TCP{SeqNum: seq}, samplePayload, startRTO); err != nil { t.Fatalf("expected payload was not received: %s", err) } // Expect retransmits of the same segment. for i := 0; i < 5; i++ { start := time.Now() - if _, err := conn.ExpectData(&testbench.TCP{SeqNum: seq}, samplePayload, 2*current); err != nil { + if _, err := conn.ExpectData(t, &testbench.TCP{SeqNum: seq}, samplePayload, 2*current); err != nil { t.Fatalf("expected payload was not received: %s loop %d", err, i) } if i == 0 { diff --git a/test/packetimpact/tests/tcp_send_window_sizes_piggyback_test.go b/test/packetimpact/tests/tcp_send_window_sizes_piggyback_test.go index 90ab85419..f91b06ba1 100644 --- a/test/packetimpact/tests/tcp_send_window_sizes_piggyback_test.go +++ b/test/packetimpact/tests/tcp_send_window_sizes_piggyback_test.go @@ -61,23 +61,23 @@ func TestSendWindowSizesPiggyback(t *testing.T) { t.Run(fmt.Sprintf("%s%d", tt.description, tt.windowSize), func(t *testing.T) { dut := testbench.NewDUT(t) defer dut.TearDown() - listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) - defer dut.Close(listenFd) + listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + defer dut.Close(t, listenFd) conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort, WindowSize: testbench.Uint16(tt.windowSize)}, testbench.TCP{SrcPort: &remotePort}) - defer conn.Close() + defer conn.Close(t) - conn.Connect() - acceptFd, _ := dut.Accept(listenFd) - defer dut.Close(acceptFd) + conn.Connect(t) + acceptFd, _ := dut.Accept(t, listenFd) + defer dut.Close(t, acceptFd) - dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1) + dut.SetSockOptInt(t, acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1) expectedTCP := testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)} - dut.Send(acceptFd, sampleData, 0) + dut.Send(t, acceptFd, sampleData, 0) expectedPayload := testbench.Payload{Bytes: tt.expectedPayload1} - if _, err := conn.ExpectData(&expectedTCP, &expectedPayload, time.Second); err != nil { + if _, err := conn.ExpectData(t, &expectedTCP, &expectedPayload, time.Second); err != nil { t.Fatalf("expected payload was not received: %s", err) } @@ -86,18 +86,18 @@ func TestSendWindowSizesPiggyback(t *testing.T) { if tt.enqueue { // Enqueue a segment for the dut to transmit. - dut.Send(acceptFd, sampleData, 0) + dut.Send(t, acceptFd, sampleData, 0) } // Send ACK for the previous segment along with data for the dut to // receive and ACK back. Sending this ACK would make room for the dut // to transmit any enqueued segment. - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh), WindowSize: testbench.Uint16(tt.windowSize)}, &testbench.Payload{Bytes: sampleData}) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh), WindowSize: testbench.Uint16(tt.windowSize)}, &testbench.Payload{Bytes: sampleData}) // Expect the dut to piggyback the ACK for received data along with // the segment enqueued for transmit. expectedPayload = testbench.Payload{Bytes: tt.expectedPayload2} - if _, err := conn.ExpectData(&expectedTCP, &expectedPayload, time.Second); err != nil { + if _, err := conn.ExpectData(t, &expectedTCP, &expectedPayload, time.Second); err != nil { t.Fatalf("expected payload was not received: %s", err) } }) diff --git a/test/packetimpact/tests/tcp_synrcvd_reset_test.go b/test/packetimpact/tests/tcp_synrcvd_reset_test.go index 7d5deab01..57d034dd1 100644 --- a/test/packetimpact/tests/tcp_synrcvd_reset_test.go +++ b/test/packetimpact/tests/tcp_synrcvd_reset_test.go @@ -32,21 +32,21 @@ func init() { func TestTCPSynRcvdReset(t *testing.T) { dut := testbench.NewDUT(t) defer dut.TearDown() - listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) - defer dut.Close(listenFD) + listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + defer dut.Close(t, listenFD) conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) - defer conn.Close() + defer conn.Close(t) // Expect dut connection to have transitioned to SYN-RCVD state. - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn)}) - if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err != nil { + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn)}) + if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err != nil { t.Fatalf("expected SYN-ACK %s", err) } - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}) // Expect the connection to have transitioned SYN-RCVD to CLOSED. // TODO(gvisor.dev/issue/478): Check for TCP_INFO on the dut side. - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) - if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil { + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) + if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil { t.Fatalf("expected a TCP RST %s", err) } } diff --git a/test/packetimpact/tests/tcp_synsent_reset_test.go b/test/packetimpact/tests/tcp_synsent_reset_test.go index 6898a2239..eac8eb19d 100644 --- a/test/packetimpact/tests/tcp_synsent_reset_test.go +++ b/test/packetimpact/tests/tcp_synsent_reset_test.go @@ -31,17 +31,19 @@ func init() { // dutSynSentState sets up the dut connection in SYN-SENT state. func dutSynSentState(t *testing.T) (*tb.DUT, *tb.TCPIPv4, uint16, uint16) { + t.Helper() + dut := tb.NewDUT(t) - clientFD, clientPort := dut.CreateBoundSocket(unix.SOCK_STREAM|unix.SOCK_NONBLOCK, unix.IPPROTO_TCP, net.ParseIP(tb.RemoteIPv4)) + clientFD, clientPort := dut.CreateBoundSocket(t, unix.SOCK_STREAM|unix.SOCK_NONBLOCK, unix.IPPROTO_TCP, net.ParseIP(tb.RemoteIPv4)) port := uint16(9001) conn := tb.NewTCPIPv4(t, tb.TCP{SrcPort: &port, DstPort: &clientPort}, tb.TCP{SrcPort: &clientPort, DstPort: &port}) sa := unix.SockaddrInet4{Port: int(port)} copy(sa.Addr[:], net.IP(net.ParseIP(tb.LocalIPv4)).To4()) // Bring the dut to SYN-SENT state with a non-blocking connect. - dut.Connect(clientFD, &sa) - if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagSyn)}, nil, time.Second); err != nil { + dut.Connect(t, clientFD, &sa) + if _, err := conn.ExpectData(t, &tb.TCP{Flags: tb.Uint8(header.TCPFlagSyn)}, nil, time.Second); err != nil { t.Fatalf("expected SYN\n") } @@ -51,13 +53,13 @@ func dutSynSentState(t *testing.T) (*tb.DUT, *tb.TCPIPv4, uint16, uint16) { // TestTCPSynSentReset tests RFC793, p67: SYN-SENT to CLOSED transition. func TestTCPSynSentReset(t *testing.T) { dut, conn, _, _ := dutSynSentState(t) - defer conn.Close() + defer conn.Close(t) defer dut.TearDown() - conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst | header.TCPFlagAck)}) + conn.Send(t, tb.TCP{Flags: tb.Uint8(header.TCPFlagRst | header.TCPFlagAck)}) // Expect the connection to have closed. // TODO(gvisor.dev/issue/478): Check for TCP_INFO on the dut side. - conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)}) - if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil { + conn.Send(t, tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)}) + if _, err := conn.ExpectData(t, &tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil { t.Fatalf("expected a TCP RST") } } @@ -67,22 +69,22 @@ func TestTCPSynSentReset(t *testing.T) { func TestTCPSynSentRcvdReset(t *testing.T) { dut, c, remotePort, clientPort := dutSynSentState(t) defer dut.TearDown() - defer c.Close() + defer c.Close(t) conn := tb.NewTCPIPv4(t, tb.TCP{SrcPort: &remotePort, DstPort: &clientPort}, tb.TCP{SrcPort: &clientPort, DstPort: &remotePort}) - defer conn.Close() + defer conn.Close(t) // Initiate new SYN connection with the same port pair // (simultaneous open case), expect the dut connection to move to // SYN-RCVD state - conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagSyn)}) - if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err != nil { + conn.Send(t, tb.TCP{Flags: tb.Uint8(header.TCPFlagSyn)}) + if _, err := conn.ExpectData(t, &tb.TCP{Flags: tb.Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err != nil { t.Fatalf("expected SYN-ACK %s\n", err) } - conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}) + conn.Send(t, tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}) // Expect the connection to have transitioned SYN-RCVD to CLOSED. // TODO(gvisor.dev/issue/478): Check for TCP_INFO on the dut side. - conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)}) - if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil { + conn.Send(t, tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)}) + if _, err := conn.ExpectData(t, &tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil { t.Fatalf("expected a TCP RST") } } diff --git a/test/packetimpact/tests/tcp_user_timeout_test.go b/test/packetimpact/tests/tcp_user_timeout_test.go index 87e45d765..551dc78e7 100644 --- a/test/packetimpact/tests/tcp_user_timeout_test.go +++ b/test/packetimpact/tests/tcp_user_timeout_test.go @@ -16,7 +16,6 @@ package tcp_user_timeout_test import ( "flag" - "fmt" "testing" "time" @@ -29,22 +28,20 @@ func init() { testbench.RegisterFlags(flag.CommandLine) } -func sendPayload(conn *testbench.TCPIPv4, dut *testbench.DUT, fd int32) error { +func sendPayload(t *testing.T, conn *testbench.TCPIPv4, dut *testbench.DUT, fd int32) { sampleData := make([]byte, 100) for i := range sampleData { sampleData[i] = uint8(i) } - conn.Drain() - dut.Send(fd, sampleData, 0) - if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, &testbench.Payload{Bytes: sampleData}, time.Second); err != nil { - return fmt.Errorf("expected data but got none: %w", err) + conn.Drain(t) + dut.Send(t, fd, sampleData, 0) + if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, &testbench.Payload{Bytes: sampleData}, time.Second); err != nil { + t.Fatalf("expected data but got none: %w", err) } - return nil } -func sendFIN(conn *testbench.TCPIPv4, dut *testbench.DUT, fd int32) error { - dut.Close(fd) - return nil +func sendFIN(t *testing.T, conn *testbench.TCPIPv4, dut *testbench.DUT, fd int32) { + dut.Close(t, fd) } func TestTCPUserTimeout(t *testing.T) { @@ -59,7 +56,7 @@ func TestTCPUserTimeout(t *testing.T) { } { for _, ttf := range []struct { description string - f func(conn *testbench.TCPIPv4, dut *testbench.DUT, fd int32) error + f func(_ *testing.T, _ *testbench.TCPIPv4, _ *testbench.DUT, fd int32) }{ {"AfterPayload", sendPayload}, {"AfterFIN", sendFIN}, @@ -68,31 +65,29 @@ func TestTCPUserTimeout(t *testing.T) { // Create a socket, listen, TCP handshake, and accept. dut := testbench.NewDUT(t) defer dut.TearDown() - listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) - defer dut.Close(listenFD) + listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + defer dut.Close(t, listenFD) conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) - defer conn.Close() - conn.Connect() - acceptFD, _ := dut.Accept(listenFD) + defer conn.Close(t) + conn.Connect(t) + acceptFD, _ := dut.Accept(t, listenFD) if tt.userTimeout != 0 { - dut.SetSockOptInt(acceptFD, unix.SOL_TCP, unix.TCP_USER_TIMEOUT, int32(tt.userTimeout.Milliseconds())) + dut.SetSockOptInt(t, acceptFD, unix.SOL_TCP, unix.TCP_USER_TIMEOUT, int32(tt.userTimeout.Milliseconds())) } - if err := ttf.f(&conn, &dut, acceptFD); err != nil { - t.Fatal(err) - } + ttf.f(t, &conn, &dut, acceptFD) time.Sleep(tt.sendDelay) - conn.Drain() - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) + conn.Drain(t) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) // If TCP_USER_TIMEOUT was set and the above delay was longer than the // TCP_USER_TIMEOUT then the DUT should send a RST in response to the // testbench's packet. expectRST := tt.userTimeout != 0 && tt.sendDelay > tt.userTimeout expectTimeout := 5 * time.Second - got, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, expectTimeout) + got, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, expectTimeout) if expectRST && err != nil { t.Errorf("expected RST packet within %s but got none: %s", expectTimeout, err) } diff --git a/test/packetimpact/tests/tcp_window_shrink_test.go b/test/packetimpact/tests/tcp_window_shrink_test.go index e78d04756..5b001fbec 100644 --- a/test/packetimpact/tests/tcp_window_shrink_test.go +++ b/test/packetimpact/tests/tcp_window_shrink_test.go @@ -31,43 +31,43 @@ func init() { func TestWindowShrink(t *testing.T) { dut := testbench.NewDUT(t) defer dut.TearDown() - listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) - defer dut.Close(listenFd) + listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + defer dut.Close(t, listenFd) conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) - defer conn.Close() + defer conn.Close(t) - conn.Connect() - acceptFd, _ := dut.Accept(listenFd) - defer dut.Close(acceptFd) + conn.Connect(t) + acceptFd, _ := dut.Accept(t, listenFd) + defer dut.Close(t, acceptFd) - dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1) + dut.SetSockOptInt(t, acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1) sampleData := []byte("Sample Data") samplePayload := &testbench.Payload{Bytes: sampleData} - dut.Send(acceptFd, sampleData, 0) - if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil { + dut.Send(t, acceptFd, sampleData, 0) + if _, err := conn.ExpectData(t, &testbench.TCP{}, samplePayload, time.Second); err != nil { t.Fatalf("expected payload was not received: %s", err) } - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) - dut.Send(acceptFd, sampleData, 0) - dut.Send(acceptFd, sampleData, 0) - if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil { + dut.Send(t, acceptFd, sampleData, 0) + dut.Send(t, acceptFd, sampleData, 0) + if _, err := conn.ExpectData(t, &testbench.TCP{}, samplePayload, time.Second); err != nil { t.Fatalf("expected payload was not received: %s", err) } - if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil { + if _, err := conn.ExpectData(t, &testbench.TCP{}, samplePayload, time.Second); err != nil { t.Fatalf("expected payload was not received: %s", err) } // We close our receiving window here - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)}) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)}) - dut.Send(acceptFd, []byte("Sample Data"), 0) + dut.Send(t, acceptFd, []byte("Sample Data"), 0) // Note: There is another kind of zero-window probing which Windows uses (by sending one // new byte at `RemoteSeqNum`), if netstack wants to go that way, we may want to change // the following lines. - expectedRemoteSeqNum := *conn.RemoteSeqNum() - 1 - if _, err := conn.ExpectData(&testbench.TCP{SeqNum: testbench.Uint32(uint32(expectedRemoteSeqNum))}, nil, time.Second); err != nil { + expectedRemoteSeqNum := *conn.RemoteSeqNum(t) - 1 + if _, err := conn.ExpectData(t, &testbench.TCP{SeqNum: testbench.Uint32(uint32(expectedRemoteSeqNum))}, nil, time.Second); err != nil { t.Fatalf("expected a packet with sequence number %d: %s", expectedRemoteSeqNum, err) } } diff --git a/test/packetimpact/tests/tcp_zero_window_probe_retransmit_test.go b/test/packetimpact/tests/tcp_zero_window_probe_retransmit_test.go index 8c89d57c9..da93267d6 100644 --- a/test/packetimpact/tests/tcp_zero_window_probe_retransmit_test.go +++ b/test/packetimpact/tests/tcp_zero_window_probe_retransmit_test.go @@ -33,27 +33,27 @@ func init() { func TestZeroWindowProbeRetransmit(t *testing.T) { dut := testbench.NewDUT(t) defer dut.TearDown() - listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) - defer dut.Close(listenFd) + listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + defer dut.Close(t, listenFd) conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) - defer conn.Close() + defer conn.Close(t) - conn.Connect() - acceptFd, _ := dut.Accept(listenFd) - defer dut.Close(acceptFd) + conn.Connect(t) + acceptFd, _ := dut.Accept(t, listenFd) + defer dut.Close(t, acceptFd) - dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1) + dut.SetSockOptInt(t, acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1) sampleData := []byte("Sample Data") samplePayload := &testbench.Payload{Bytes: sampleData} // Send and receive sample data to the dut. - dut.Send(acceptFd, sampleData, 0) - if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil { + dut.Send(t, acceptFd, sampleData, 0) + if _, err := conn.ExpectData(t, &testbench.TCP{}, samplePayload, time.Second); err != nil { t.Fatalf("expected payload was not received: %s", err) } - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload) - if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, nil, time.Second); err != nil { + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload) + if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, nil, time.Second); err != nil { t.Fatalf("expected packet was not received: %s", err) } @@ -63,15 +63,15 @@ func TestZeroWindowProbeRetransmit(t *testing.T) { // of the recorded first zero probe transmission duration. // // Advertize zero receive window again. - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)}) - probeSeq := testbench.Uint32(uint32(*conn.RemoteSeqNum() - 1)) - ackProbe := testbench.Uint32(uint32(*conn.RemoteSeqNum())) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)}) + probeSeq := testbench.Uint32(uint32(*conn.RemoteSeqNum(t) - 1)) + ackProbe := testbench.Uint32(uint32(*conn.RemoteSeqNum(t))) startProbeDuration := time.Second current := startProbeDuration first := time.Now() // Ask the dut to send out data. - dut.Send(acceptFd, sampleData, 0) + dut.Send(t, acceptFd, sampleData, 0) // Expect the dut to keep the connection alive as long as the remote is // acknowledging the zero-window probes. for i := 0; i < 5; i++ { @@ -79,7 +79,7 @@ func TestZeroWindowProbeRetransmit(t *testing.T) { // Expect zero-window probe with a timeout which is a function of the typical // first retransmission time. The retransmission times is supposed to // exponentially increase. - if _, err := conn.ExpectData(&testbench.TCP{SeqNum: probeSeq}, nil, 2*current); err != nil { + if _, err := conn.ExpectData(t, &testbench.TCP{SeqNum: probeSeq}, nil, 2*current); err != nil { t.Fatalf("expected a probe with sequence number %d: loop %d", probeSeq, i) } if i == 0 { @@ -92,14 +92,13 @@ func TestZeroWindowProbeRetransmit(t *testing.T) { t.Errorf("got zero probe %d after %s, want >= %s", i, got, want) } // Acknowledge the zero-window probes from the dut. - conn.Send(testbench.TCP{AckNum: ackProbe, Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)}) + conn.Send(t, testbench.TCP{AckNum: ackProbe, Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)}) current *= 2 } // Advertize non-zero window. - conn.Send(testbench.TCP{AckNum: ackProbe, Flags: testbench.Uint8(header.TCPFlagAck)}) + conn.Send(t, testbench.TCP{AckNum: ackProbe, Flags: testbench.Uint8(header.TCPFlagAck)}) // Expect the dut to recover and transmit data. - if _, err := conn.ExpectData(&testbench. - TCP{SeqNum: ackProbe}, samplePayload, time.Second); err != nil { + if _, err := conn.ExpectData(t, &testbench.TCP{SeqNum: ackProbe}, samplePayload, time.Second); err != nil { t.Fatalf("expected payload was not received: %s", err) } } diff --git a/test/packetimpact/tests/tcp_zero_window_probe_test.go b/test/packetimpact/tests/tcp_zero_window_probe_test.go index 649fd5699..44cac42f8 100644 --- a/test/packetimpact/tests/tcp_zero_window_probe_test.go +++ b/test/packetimpact/tests/tcp_zero_window_probe_test.go @@ -33,29 +33,29 @@ func init() { func TestZeroWindowProbe(t *testing.T) { dut := testbench.NewDUT(t) defer dut.TearDown() - listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) - defer dut.Close(listenFd) + listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + defer dut.Close(t, listenFd) conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) - defer conn.Close() + defer conn.Close(t) - conn.Connect() - acceptFd, _ := dut.Accept(listenFd) - defer dut.Close(acceptFd) + conn.Connect(t) + acceptFd, _ := dut.Accept(t, listenFd) + defer dut.Close(t, acceptFd) - dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1) + dut.SetSockOptInt(t, acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1) sampleData := []byte("Sample Data") samplePayload := &testbench.Payload{Bytes: sampleData} start := time.Now() // Send and receive sample data to the dut. - dut.Send(acceptFd, sampleData, 0) - if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil { + dut.Send(t, acceptFd, sampleData, 0) + if _, err := conn.ExpectData(t, &testbench.TCP{}, samplePayload, time.Second); err != nil { t.Fatalf("expected payload was not received: %s", err) } sendTime := time.Now().Sub(start) - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload) - if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, nil, time.Second); err != nil { + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload) + if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, nil, time.Second); err != nil { t.Fatalf("expected packet was not received: %s", err) } @@ -63,24 +63,24 @@ func TestZeroWindowProbe(t *testing.T) { // probe to be sent. // // Advertize zero window to the dut. - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)}) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)}) // Expected sequence number of the zero window probe. - probeSeq := testbench.Uint32(uint32(*conn.RemoteSeqNum() - 1)) + probeSeq := testbench.Uint32(uint32(*conn.RemoteSeqNum(t) - 1)) // Expected ack number of the ACK for the probe. - ackProbe := testbench.Uint32(uint32(*conn.RemoteSeqNum())) + ackProbe := testbench.Uint32(uint32(*conn.RemoteSeqNum(t))) // Expect there are no zero-window probes sent until there is data to be sent out // from the dut. - if _, err := conn.ExpectData(&testbench.TCP{SeqNum: probeSeq}, nil, 2*time.Second); err == nil { + if _, err := conn.ExpectData(t, &testbench.TCP{SeqNum: probeSeq}, nil, 2*time.Second); err == nil { t.Fatalf("unexpected packet with sequence number %d: %s", probeSeq, err) } start = time.Now() // Ask the dut to send out data. - dut.Send(acceptFd, sampleData, 0) + dut.Send(t, acceptFd, sampleData, 0) // Expect zero-window probe from the dut. - if _, err := conn.ExpectData(&testbench.TCP{SeqNum: probeSeq}, nil, time.Second); err != nil { + if _, err := conn.ExpectData(t, &testbench.TCP{SeqNum: probeSeq}, nil, time.Second); err != nil { t.Fatalf("expected a packet with sequence number %d: %s", probeSeq, err) } // Expect the probe to be sent after some time. Compare against the previous @@ -94,9 +94,9 @@ func TestZeroWindowProbe(t *testing.T) { // and sends out the sample payload after the send window opens. // // Advertize non-zero window to the dut and ack the zero window probe. - conn.Send(testbench.TCP{AckNum: ackProbe, Flags: testbench.Uint8(header.TCPFlagAck)}) + conn.Send(t, testbench.TCP{AckNum: ackProbe, Flags: testbench.Uint8(header.TCPFlagAck)}) // Expect the dut to recover and transmit data. - if _, err := conn.ExpectData(&testbench.TCP{SeqNum: ackProbe}, samplePayload, time.Second); err != nil { + if _, err := conn.ExpectData(t, &testbench.TCP{SeqNum: ackProbe}, samplePayload, time.Second); err != nil { t.Fatalf("expected payload was not received: %s", err) } @@ -104,9 +104,9 @@ func TestZeroWindowProbe(t *testing.T) { // Check if the dut responds as we do for a similar probe sent to it. // Basically with sequence number to one byte behind the unacknowledged // sequence number. - p := testbench.Uint32(uint32(*conn.LocalSeqNum())) - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), SeqNum: testbench.Uint32(uint32(*conn.LocalSeqNum() - 1))}) - if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), AckNum: p}, nil, time.Second); err != nil { + p := testbench.Uint32(uint32(*conn.LocalSeqNum(t))) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), SeqNum: testbench.Uint32(uint32(*conn.LocalSeqNum(t) - 1))}) + if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), AckNum: p}, nil, time.Second); err != nil { t.Fatalf("expected a packet with ack number: %d: %s", p, err) } } diff --git a/test/packetimpact/tests/tcp_zero_window_probe_usertimeout_test.go b/test/packetimpact/tests/tcp_zero_window_probe_usertimeout_test.go index 3c467b14f..09a1c653f 100644 --- a/test/packetimpact/tests/tcp_zero_window_probe_usertimeout_test.go +++ b/test/packetimpact/tests/tcp_zero_window_probe_usertimeout_test.go @@ -33,27 +33,27 @@ func init() { func TestZeroWindowProbeUserTimeout(t *testing.T) { dut := testbench.NewDUT(t) defer dut.TearDown() - listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) - defer dut.Close(listenFd) + listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + defer dut.Close(t, listenFd) conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) - defer conn.Close() + defer conn.Close(t) - conn.Connect() - acceptFd, _ := dut.Accept(listenFd) - defer dut.Close(acceptFd) + conn.Connect(t) + acceptFd, _ := dut.Accept(t, listenFd) + defer dut.Close(t, acceptFd) - dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1) + dut.SetSockOptInt(t, acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1) sampleData := []byte("Sample Data") samplePayload := &testbench.Payload{Bytes: sampleData} // Send and receive sample data to the dut. - dut.Send(acceptFd, sampleData, 0) - if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil { + dut.Send(t, acceptFd, sampleData, 0) + if _, err := conn.ExpectData(t, &testbench.TCP{}, samplePayload, time.Second); err != nil { t.Fatalf("expected payload was not received: %s", err) } - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload) - if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, nil, time.Second); err != nil { + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload) + if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, nil, time.Second); err != nil { t.Fatalf("expected packet was not received: %s", err) } @@ -61,15 +61,15 @@ func TestZeroWindowProbeUserTimeout(t *testing.T) { // probe to be sent. // // Advertize zero window to the dut. - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)}) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)}) // Expected sequence number of the zero window probe. - probeSeq := testbench.Uint32(uint32(*conn.RemoteSeqNum() - 1)) + probeSeq := testbench.Uint32(uint32(*conn.RemoteSeqNum(t) - 1)) start := time.Now() // Ask the dut to send out data. - dut.Send(acceptFd, sampleData, 0) + dut.Send(t, acceptFd, sampleData, 0) // Expect zero-window probe from the dut. - if _, err := conn.ExpectData(&testbench.TCP{SeqNum: probeSeq}, nil, time.Second); err != nil { + if _, err := conn.ExpectData(t, &testbench.TCP{SeqNum: probeSeq}, nil, time.Second); err != nil { t.Fatalf("expected a packet with sequence number %d: %s", probeSeq, err) } // Record the duration for first probe, the dut sends the zero window probe after @@ -80,19 +80,19 @@ func TestZeroWindowProbeUserTimeout(t *testing.T) { // when the dut is sending zero-window probes. // // Reduce the retransmit timeout. - dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_USER_TIMEOUT, int32(startProbeDuration.Milliseconds())) + dut.SetSockOptInt(t, acceptFd, unix.IPPROTO_TCP, unix.TCP_USER_TIMEOUT, int32(startProbeDuration.Milliseconds())) // Advertize zero window again. - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)}) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)}) // Ask the dut to send out data that would trigger zero window probe retransmissions. - dut.Send(acceptFd, sampleData, 0) + dut.Send(t, acceptFd, sampleData, 0) // Wait for the connection to timeout after multiple zero-window probe retransmissions. time.Sleep(8 * startProbeDuration) // Expect the connection to have timed out and closed which would cause the dut // to reply with a RST to the ACK we send. - conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) - if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil { + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) + if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil { t.Fatalf("expected a TCP RST") } } diff --git a/test/packetimpact/tests/udp_discard_mcast_source_addr_test.go b/test/packetimpact/tests/udp_discard_mcast_source_addr_test.go index b0315e67c..d30177e64 100644 --- a/test/packetimpact/tests/udp_discard_mcast_source_addr_test.go +++ b/test/packetimpact/tests/udp_discard_mcast_source_addr_test.go @@ -36,11 +36,11 @@ func init() { func TestDiscardsUDPPacketsWithMcastSourceAddressV4(t *testing.T) { dut := testbench.NewDUT(t) defer dut.TearDown() - remoteFD, remotePort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP(testbench.RemoteIPv4)) - defer dut.Close(remoteFD) - dut.SetSockOptTimeval(remoteFD, unix.SOL_SOCKET, unix.SO_RCVTIMEO, &oneSecond) + remoteFD, remotePort := dut.CreateBoundSocket(t, unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP(testbench.RemoteIPv4)) + defer dut.Close(t, remoteFD) + dut.SetSockOptTimeval(t, remoteFD, unix.SOL_SOCKET, unix.SO_RCVTIMEO, &oneSecond) conn := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) - defer conn.Close() + defer conn.Close(t) for _, mcastAddr := range []net.IP{ net.IPv4allsys, @@ -50,11 +50,12 @@ func TestDiscardsUDPPacketsWithMcastSourceAddressV4(t *testing.T) { } { t.Run(fmt.Sprintf("srcaddr=%s", mcastAddr), func(t *testing.T) { conn.SendIP( + t, testbench.IPv4{SrcAddr: testbench.Address(tcpip.Address(mcastAddr.To4()))}, testbench.UDP{}, ) - ret, payload, errno := dut.RecvWithErrno(context.Background(), remoteFD, 100, 0) + ret, payload, errno := dut.RecvWithErrno(context.Background(), t, remoteFD, 100, 0) if errno != syscall.EAGAIN || errno != syscall.EWOULDBLOCK { t.Errorf("Recv got unexpected result, ret=%d, payload=%q, errno=%s", ret, payload, errno) } @@ -65,11 +66,11 @@ func TestDiscardsUDPPacketsWithMcastSourceAddressV4(t *testing.T) { func TestDiscardsUDPPacketsWithMcastSourceAddressV6(t *testing.T) { dut := testbench.NewDUT(t) defer dut.TearDown() - remoteFD, remotePort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP(testbench.RemoteIPv6)) - defer dut.Close(remoteFD) - dut.SetSockOptTimeval(remoteFD, unix.SOL_SOCKET, unix.SO_RCVTIMEO, &oneSecond) + remoteFD, remotePort := dut.CreateBoundSocket(t, unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP(testbench.RemoteIPv6)) + defer dut.Close(t, remoteFD) + dut.SetSockOptTimeval(t, remoteFD, unix.SOL_SOCKET, unix.SO_RCVTIMEO, &oneSecond) conn := testbench.NewUDPIPv6(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) - defer conn.Close() + defer conn.Close(t) for _, mcastAddr := range []net.IP{ net.IPv6interfacelocalallnodes, @@ -80,10 +81,11 @@ func TestDiscardsUDPPacketsWithMcastSourceAddressV6(t *testing.T) { } { t.Run(fmt.Sprintf("srcaddr=%s", mcastAddr), func(t *testing.T) { conn.SendIPv6( + t, testbench.IPv6{SrcAddr: testbench.Address(tcpip.Address(mcastAddr.To16()))}, testbench.UDP{}, ) - ret, payload, errno := dut.RecvWithErrno(context.Background(), remoteFD, 100, 0) + ret, payload, errno := dut.RecvWithErrno(context.Background(), t, remoteFD, 100, 0) if errno != syscall.EAGAIN || errno != syscall.EWOULDBLOCK { t.Errorf("Recv got unexpected result, ret=%d, payload=%q, errno=%s", ret, payload, errno) } diff --git a/test/packetimpact/tests/udp_icmp_error_propagation_test.go b/test/packetimpact/tests/udp_icmp_error_propagation_test.go index b754918f6..715e8f5b5 100644 --- a/test/packetimpact/tests/udp_icmp_error_propagation_test.go +++ b/test/packetimpact/tests/udp_icmp_error_propagation_test.go @@ -72,7 +72,7 @@ func (e icmpError) ToICMPv4() *testbench.ICMPv4 { type errorDetection struct { name string useValidConn bool - f func(context.Context, testData) error + f func(context.Context, *testing.T, testData) } type testData struct { @@ -95,12 +95,14 @@ func wantErrno(c connectionMode, icmpErr icmpError) syscall.Errno { } // sendICMPError sends an ICMP error message in response to a UDP datagram. -func sendICMPError(conn *testbench.UDPIPv4, icmpErr icmpError, udp *testbench.UDP) error { - layers := (*testbench.Connection)(conn).CreateFrame(nil) +func sendICMPError(t *testing.T, conn *testbench.UDPIPv4, icmpErr icmpError, udp *testbench.UDP) { + t.Helper() + + layers := (*testbench.Connection)(conn).CreateFrame(t, nil) layers = layers[:len(layers)-1] ip, ok := udp.Prev().(*testbench.IPv4) if !ok { - return fmt.Errorf("expected %s to be IPv4", udp.Prev()) + t.Fatalf("expected %s to be IPv4", udp.Prev()) } if icmpErr == timeToLiveExceeded { *ip.TTL = 1 @@ -114,84 +116,82 @@ func sendICMPError(conn *testbench.UDPIPv4, icmpErr icmpError, udp *testbench.UD // resulting in a mal-formed packet. layers = append(layers, icmpErr.ToICMPv4(), ip, udp) - (*testbench.Connection)(conn).SendFrameStateless(layers) - return nil + (*testbench.Connection)(conn).SendFrameStateless(t, layers) } // testRecv tests observing the ICMP error through the recv syscall. A packet // is sent to the DUT, and if wantErrno is non-zero, then the first recv should // fail and the second should succeed. Otherwise if wantErrno is zero then the // first recv should succeed immediately. -func testRecv(ctx context.Context, d testData) error { +func testRecv(ctx context.Context, t *testing.T, d testData) { + t.Helper() + // Check that receiving on the clean socket works. - d.conn.Send(testbench.UDP{DstPort: &d.cleanPort}) - d.dut.Recv(d.cleanFD, 100, 0) + d.conn.Send(t, testbench.UDP{DstPort: &d.cleanPort}) + d.dut.Recv(t, d.cleanFD, 100, 0) - d.conn.Send(testbench.UDP{}) + d.conn.Send(t, testbench.UDP{}) if d.wantErrno != syscall.Errno(0) { ctx, cancel := context.WithTimeout(ctx, time.Second) defer cancel() - ret, _, err := d.dut.RecvWithErrno(ctx, d.remoteFD, 100, 0) + ret, _, err := d.dut.RecvWithErrno(ctx, t, d.remoteFD, 100, 0) if ret != -1 { - return fmt.Errorf("recv after ICMP error succeeded unexpectedly, expected (%[1]d) %[1]v", d.wantErrno) + t.Fatalf("recv after ICMP error succeeded unexpectedly, expected (%[1]d) %[1]v", d.wantErrno) } if err != d.wantErrno { - return fmt.Errorf("recv after ICMP error resulted in error (%[1]d) %[1]v, expected (%[2]d) %[2]v", err, d.wantErrno) + t.Fatalf("recv after ICMP error resulted in error (%[1]d) %[1]v, expected (%[2]d) %[2]v", err, d.wantErrno) } } - d.dut.Recv(d.remoteFD, 100, 0) - return nil + d.dut.Recv(t, d.remoteFD, 100, 0) } // testSendTo tests observing the ICMP error through the send syscall. If // wantErrno is non-zero, the first send should fail and a subsequent send // should suceed; while if wantErrno is zero then the first send should just // succeed. -func testSendTo(ctx context.Context, d testData) error { +func testSendTo(ctx context.Context, t *testing.T, d testData) { // Check that sending on the clean socket works. - d.dut.SendTo(d.cleanFD, nil, 0, d.conn.LocalAddr()) - if _, err := d.conn.Expect(testbench.UDP{SrcPort: &d.cleanPort}, time.Second); err != nil { - return fmt.Errorf("did not receive UDP packet from clean socket on DUT: %s", err) + d.dut.SendTo(t, d.cleanFD, nil, 0, d.conn.LocalAddr(t)) + if _, err := d.conn.Expect(t, testbench.UDP{SrcPort: &d.cleanPort}, time.Second); err != nil { + t.Fatalf("did not receive UDP packet from clean socket on DUT: %s", err) } if d.wantErrno != syscall.Errno(0) { ctx, cancel := context.WithTimeout(ctx, time.Second) defer cancel() - ret, err := d.dut.SendToWithErrno(ctx, d.remoteFD, nil, 0, d.conn.LocalAddr()) + ret, err := d.dut.SendToWithErrno(ctx, t, d.remoteFD, nil, 0, d.conn.LocalAddr(t)) if ret != -1 { - return fmt.Errorf("sendto after ICMP error succeeded unexpectedly, expected (%[1]d) %[1]v", d.wantErrno) + t.Fatalf("sendto after ICMP error succeeded unexpectedly, expected (%[1]d) %[1]v", d.wantErrno) } if err != d.wantErrno { - return fmt.Errorf("sendto after ICMP error resulted in error (%[1]d) %[1]v, expected (%[2]d) %[2]v", err, d.wantErrno) + t.Fatalf("sendto after ICMP error resulted in error (%[1]d) %[1]v, expected (%[2]d) %[2]v", err, d.wantErrno) } } - d.dut.SendTo(d.remoteFD, nil, 0, d.conn.LocalAddr()) - if _, err := d.conn.Expect(testbench.UDP{}, time.Second); err != nil { - return fmt.Errorf("did not receive UDP packet as expected: %s", err) + d.dut.SendTo(t, d.remoteFD, nil, 0, d.conn.LocalAddr(t)) + if _, err := d.conn.Expect(t, testbench.UDP{}, time.Second); err != nil { + t.Fatalf("did not receive UDP packet as expected: %s", err) } - return nil } -func testSockOpt(_ context.Context, d testData) error { +func testSockOpt(_ context.Context, t *testing.T, d testData) { // Check that there's no pending error on the clean socket. - if errno := syscall.Errno(d.dut.GetSockOptInt(d.cleanFD, unix.SOL_SOCKET, unix.SO_ERROR)); errno != syscall.Errno(0) { - return fmt.Errorf("unexpected error (%[1]d) %[1]v on clean socket", errno) + if errno := syscall.Errno(d.dut.GetSockOptInt(t, d.cleanFD, unix.SOL_SOCKET, unix.SO_ERROR)); errno != syscall.Errno(0) { + t.Fatalf("unexpected error (%[1]d) %[1]v on clean socket", errno) } - if errno := syscall.Errno(d.dut.GetSockOptInt(d.remoteFD, unix.SOL_SOCKET, unix.SO_ERROR)); errno != d.wantErrno { - return fmt.Errorf("SO_ERROR sockopt after ICMP error is (%[1]d) %[1]v, expected (%[2]d) %[2]v", errno, d.wantErrno) + if errno := syscall.Errno(d.dut.GetSockOptInt(t, d.remoteFD, unix.SOL_SOCKET, unix.SO_ERROR)); errno != d.wantErrno { + t.Fatalf("SO_ERROR sockopt after ICMP error is (%[1]d) %[1]v, expected (%[2]d) %[2]v", errno, d.wantErrno) } // Check that after clearing socket error, sending doesn't fail. - d.dut.SendTo(d.remoteFD, nil, 0, d.conn.LocalAddr()) - if _, err := d.conn.Expect(testbench.UDP{}, time.Second); err != nil { - return fmt.Errorf("did not receive UDP packet as expected: %s", err) + d.dut.SendTo(t, d.remoteFD, nil, 0, d.conn.LocalAddr(t)) + if _, err := d.conn.Expect(t, testbench.UDP{}, time.Second); err != nil { + t.Fatalf("did not receive UDP packet as expected: %s", err) } - return nil } // TestUDPICMPErrorPropagation tests that ICMP error messages in response to @@ -227,31 +227,29 @@ func TestUDPICMPErrorPropagation(t *testing.T) { dut := testbench.NewDUT(t) defer dut.TearDown() - remoteFD, remotePort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0")) - defer dut.Close(remoteFD) + remoteFD, remotePort := dut.CreateBoundSocket(t, unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0")) + defer dut.Close(t, remoteFD) // Create a second, clean socket on the DUT to ensure that the ICMP // error messages only affect the sockets they are intended for. - cleanFD, cleanPort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0")) - defer dut.Close(cleanFD) + cleanFD, cleanPort := dut.CreateBoundSocket(t, unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0")) + defer dut.Close(t, cleanFD) conn := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) - defer conn.Close() + defer conn.Close(t) if connect { - dut.Connect(remoteFD, conn.LocalAddr()) - dut.Connect(cleanFD, conn.LocalAddr()) + dut.Connect(t, remoteFD, conn.LocalAddr(t)) + dut.Connect(t, cleanFD, conn.LocalAddr(t)) } - dut.SendTo(remoteFD, nil, 0, conn.LocalAddr()) - udp, err := conn.Expect(testbench.UDP{}, time.Second) + dut.SendTo(t, remoteFD, nil, 0, conn.LocalAddr(t)) + udp, err := conn.Expect(t, testbench.UDP{}, time.Second) if err != nil { t.Fatalf("did not receive message from DUT: %s", err) } - if err := sendICMPError(&conn, icmpErr, udp); err != nil { - t.Fatal(err) - } + sendICMPError(t, &conn, icmpErr, udp) errDetectConn := &conn if errDetect.useValidConn { @@ -260,14 +258,12 @@ func TestUDPICMPErrorPropagation(t *testing.T) { // interactions between it and the the DUT should be independent of // the ICMP error at least at the port level. connClean := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) - defer connClean.Close() + defer connClean.Close(t) errDetectConn = &connClean } - if err := errDetect.f(context.Background(), testData{&dut, errDetectConn, remoteFD, remotePort, cleanFD, cleanPort, wantErrno}); err != nil { - t.Fatal(err) - } + errDetect.f(context.Background(), t, testData{&dut, errDetectConn, remoteFD, remotePort, cleanFD, cleanPort, wantErrno}) }) } } @@ -285,24 +281,24 @@ func TestICMPErrorDuringUDPRecv(t *testing.T) { dut := testbench.NewDUT(t) defer dut.TearDown() - remoteFD, remotePort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0")) - defer dut.Close(remoteFD) + remoteFD, remotePort := dut.CreateBoundSocket(t, unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0")) + defer dut.Close(t, remoteFD) // Create a second, clean socket on the DUT to ensure that the ICMP // error messages only affect the sockets they are intended for. - cleanFD, cleanPort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0")) - defer dut.Close(cleanFD) + cleanFD, cleanPort := dut.CreateBoundSocket(t, unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0")) + defer dut.Close(t, cleanFD) conn := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) - defer conn.Close() + defer conn.Close(t) if connect { - dut.Connect(remoteFD, conn.LocalAddr()) - dut.Connect(cleanFD, conn.LocalAddr()) + dut.Connect(t, remoteFD, conn.LocalAddr(t)) + dut.Connect(t, cleanFD, conn.LocalAddr(t)) } - dut.SendTo(remoteFD, nil, 0, conn.LocalAddr()) - udp, err := conn.Expect(testbench.UDP{}, time.Second) + dut.SendTo(t, remoteFD, nil, 0, conn.LocalAddr(t)) + udp, err := conn.Expect(t, testbench.UDP{}, time.Second) if err != nil { t.Fatalf("did not receive message from DUT: %s", err) } @@ -316,7 +312,7 @@ func TestICMPErrorDuringUDPRecv(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() - ret, _, err := dut.RecvWithErrno(ctx, remoteFD, 100, 0) + ret, _, err := dut.RecvWithErrno(ctx, t, remoteFD, 100, 0) if ret != -1 { t.Errorf("recv during ICMP error succeeded unexpectedly, expected (%[1]d) %[1]v", wantErrno) return @@ -330,7 +326,7 @@ func TestICMPErrorDuringUDPRecv(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() - if ret, _, err := dut.RecvWithErrno(ctx, remoteFD, 100, 0); ret == -1 { + if ret, _, err := dut.RecvWithErrno(ctx, t, remoteFD, 100, 0); ret == -1 { t.Errorf("recv after ICMP error failed with (%[1]d) %[1]", err) } }() @@ -341,7 +337,7 @@ func TestICMPErrorDuringUDPRecv(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() - if ret, _, err := dut.RecvWithErrno(ctx, cleanFD, 100, 0); ret == -1 { + if ret, _, err := dut.RecvWithErrno(ctx, t, cleanFD, 100, 0); ret == -1 { t.Errorf("recv on clean socket failed with (%[1]d) %[1]", err) } }() @@ -352,12 +348,10 @@ func TestICMPErrorDuringUDPRecv(t *testing.T) { // alternative is available. time.Sleep(2 * time.Second) - if err := sendICMPError(&conn, icmpErr, udp); err != nil { - t.Fatal(err) - } + sendICMPError(t, &conn, icmpErr, udp) - conn.Send(testbench.UDP{DstPort: &cleanPort}) - conn.Send(testbench.UDP{}) + conn.Send(t, testbench.UDP{DstPort: &cleanPort}) + conn.Send(t, testbench.UDP{}) wg.Wait() }) } diff --git a/test/packetimpact/tests/udp_recv_mcast_bcast_test.go b/test/packetimpact/tests/udp_recv_mcast_bcast_test.go index 263a54291..fcd202643 100644 --- a/test/packetimpact/tests/udp_recv_mcast_bcast_test.go +++ b/test/packetimpact/tests/udp_recv_mcast_bcast_test.go @@ -31,10 +31,10 @@ func init() { func TestUDPRecvMulticastBroadcast(t *testing.T) { dut := testbench.NewDUT(t) defer dut.TearDown() - boundFD, remotePort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.IPv4(0, 0, 0, 0)) - defer dut.Close(boundFD) + boundFD, remotePort := dut.CreateBoundSocket(t, unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.IPv4(0, 0, 0, 0)) + defer dut.Close(t, boundFD) conn := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) - defer conn.Close() + defer conn.Close(t) for _, bcastAddr := range []net.IP{ broadcastAddr(net.ParseIP(testbench.RemoteIPv4), net.CIDRMask(testbench.IPv4PrefixLength, 32)), @@ -43,12 +43,13 @@ func TestUDPRecvMulticastBroadcast(t *testing.T) { } { payload := testbench.GenerateRandomPayload(t, 1<<10) conn.SendIP( + t, testbench.IPv4{DstAddr: testbench.Address(tcpip.Address(bcastAddr.To4()))}, testbench.UDP{}, &testbench.Payload{Bytes: payload}, ) t.Logf("Receiving packet sent to address: %s", bcastAddr) - if got, want := string(dut.Recv(boundFD, int32(len(payload)), 0)), string(payload); got != want { + if got, want := string(dut.Recv(t, boundFD, int32(len(payload)), 0)), string(payload); got != want { t.Errorf("received payload does not match sent payload got: %s, want: %s", got, want) } } diff --git a/test/packetimpact/tests/udp_send_recv_dgram_test.go b/test/packetimpact/tests/udp_send_recv_dgram_test.go index bd53ad90b..dc20275d6 100644 --- a/test/packetimpact/tests/udp_send_recv_dgram_test.go +++ b/test/packetimpact/tests/udp_send_recv_dgram_test.go @@ -29,10 +29,10 @@ func init() { } type udpConn interface { - Send(testbench.UDP, ...testbench.Layer) - ExpectData(testbench.UDP, testbench.Payload, time.Duration) (testbench.Layers, error) - Drain() - Close() + Send(*testing.T, testbench.UDP, ...testbench.Layer) + ExpectData(*testing.T, testbench.UDP, testbench.Payload, time.Duration) (testbench.Layers, error) + Drain(*testing.T) + Close(*testing.T) } func TestUDP(t *testing.T) { @@ -51,21 +51,21 @@ func TestUDP(t *testing.T) { } else { addr = testbench.RemoteIPv6 } - boundFD, remotePort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP(addr)) - defer dut.Close(boundFD) + boundFD, remotePort := dut.CreateBoundSocket(t, unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP(addr)) + defer dut.Close(t, boundFD) var conn udpConn var localAddr unix.Sockaddr if isIPv4 { v4Conn := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) - localAddr = v4Conn.LocalAddr() + localAddr = v4Conn.LocalAddr(t) conn = &v4Conn } else { v6Conn := testbench.NewUDPIPv6(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) - localAddr = v6Conn.LocalAddr() + localAddr = v6Conn.LocalAddr(t) conn = &v6Conn } - defer conn.Close() + defer conn.Close(t) testCases := []struct { name string @@ -81,17 +81,17 @@ func TestUDP(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { t.Run("Send", func(t *testing.T) { - conn.Send(testbench.UDP{}, &testbench.Payload{Bytes: tc.payload}) - if got, want := string(dut.Recv(boundFD, int32(len(tc.payload)), 0)), string(tc.payload); got != want { + conn.Send(t, testbench.UDP{}, &testbench.Payload{Bytes: tc.payload}) + if got, want := string(dut.Recv(t, boundFD, int32(len(tc.payload)), 0)), string(tc.payload); got != want { t.Fatalf("received payload does not match sent payload got: %s, want: %s", got, want) } }) t.Run("Recv", func(t *testing.T) { - conn.Drain() - if got, want := int(dut.SendTo(boundFD, tc.payload, 0, localAddr)), len(tc.payload); got != want { + conn.Drain(t) + if got, want := int(dut.SendTo(t, boundFD, tc.payload, 0, localAddr)), len(tc.payload); got != want { t.Fatalf("short write got: %d, want: %d", got, want) } - if _, err := conn.ExpectData(testbench.UDP{SrcPort: &remotePort}, testbench.Payload{Bytes: tc.payload}, time.Second); err != nil { + if _, err := conn.ExpectData(t, testbench.UDP{SrcPort: &remotePort}, testbench.Payload{Bytes: tc.payload}, time.Second); err != nil { t.Fatal(err) } }) diff --git a/test/root/crictl_test.go b/test/root/crictl_test.go index 193705ab8..df91fa0fe 100644 --- a/test/root/crictl_test.go +++ b/test/root/crictl_test.go @@ -405,11 +405,8 @@ func setup(t *testing.T, version string) (*criutil.Crictl, func(), error) { } // We provide the shim, followed by the runtime, and then a - // temporary root directory. Note that we can safely assume - // that the shim has been installed in the same directory as - // the runtime (for test installs and for normal installs). - // Since this is v1, the binary name will be fixed. - config = fmt.Sprintf(v1Template, path.Join(runtimeDir, "gvisor-containerd-shim"), runtime, runtimeDir) + // temporary root directory. + config = fmt.Sprintf(v1Template, criutil.ResolvePath("gvisor-containerd-shim"), runtime, containerdRoot) case v2: // This is only supported past 1.2. if major < 1 || (major == 1 && minor <= 1) { diff --git a/test/runner/BUILD b/test/runner/BUILD index 1f45a6922..63c7ec83a 100644 --- a/test/runner/BUILD +++ b/test/runner/BUILD @@ -17,6 +17,7 @@ go_binary( "//test/runner/gtest", "//test/uds", "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", + "@com_github_syndtr_gocapability//capability:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/test/runner/defs.bzl b/test/runner/defs.bzl index 600cb5192..c92392b35 100644 --- a/test/runner/defs.bzl +++ b/test/runner/defs.bzl @@ -157,7 +157,7 @@ def syscall_test( platform = "native", use_tmpfs = False, add_uds_tree = add_uds_tree, - tags = tags, + tags = list(tags), ) for (platform, platform_tags) in platforms.items(): diff --git a/test/runner/runner.go b/test/runner/runner.go index 2296f3a46..bc4b39cbb 100644 --- a/test/runner/runner.go +++ b/test/runner/runner.go @@ -30,6 +30,7 @@ import ( "time" specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/syndtr/gocapability/capability" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/test/testutil" @@ -105,6 +106,13 @@ func runTestCaseNative(testBin string, tc gtest.TestCase, t *testing.T) { cmd.Env = env cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr + + if specutils.HasCapabilities(capability.CAP_NET_ADMIN) { + cmd.SysProcAttr = &syscall.SysProcAttr{ + Cloneflags: syscall.CLONE_NEWNET, + } + } + if err := cmd.Run(); err != nil { ws := err.(*exec.ExitError).Sys().(syscall.WaitStatus) t.Errorf("test %q exited with status %d, want 0", tc.FullName(), ws.ExitStatus()) diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index c06a75ada..c19b30b4a 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -400,6 +400,7 @@ syscall_test( syscall_test( add_overlay = True, test = "//test/syscalls/linux:open_test", + vfs2 = "True", ) syscall_test( @@ -640,11 +641,13 @@ syscall_test( syscall_test( add_overlay = True, test = "//test/syscalls/linux:sendfile_socket_test", + vfs2 = "True", ) syscall_test( add_overlay = True, test = "//test/syscalls/linux:sendfile_test", + vfs2 = "True", ) syscall_test( diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 662d780d8..66a31cd28 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -943,6 +943,7 @@ cc_binary( "//test/util:eventfd_util", "//test/util:file_descriptor", "//test/util:fs_util", + "@com_google_absl//absl/container:node_hash_set", "@com_google_absl//absl/strings", gtest, "//test/util:posix_error", diff --git a/test/syscalls/linux/dev.cc b/test/syscalls/linux/dev.cc index 6fa16208e..1d0d584cd 100644 --- a/test/syscalls/linux/dev.cc +++ b/test/syscalls/linux/dev.cc @@ -161,6 +161,19 @@ TEST(DevTest, OpenDevFuse) { ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/fuse", O_RDONLY)); } +TEST(DevTest, ReadDevFuseWithoutMount) { + // Note(gvisor.dev/issue/3076) This won't work in the sentry until the new + // device registration is complete. + SKIP_IF(IsRunningWithVFS1() || IsRunningOnGvisor()); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/fuse", O_RDONLY)); + + std::vector<char> buf(1); + EXPECT_THAT(ReadFd(fd.get(), buf.data(), sizeof(buf)), + SyscallFailsWithErrno(EPERM)); +} + } // namespace } // namespace testing diff --git a/test/syscalls/linux/futex.cc b/test/syscalls/linux/futex.cc index 40c80a6e1..90b1f0508 100644 --- a/test/syscalls/linux/futex.cc +++ b/test/syscalls/linux/futex.cc @@ -18,6 +18,7 @@ #include <sys/syscall.h> #include <sys/time.h> #include <sys/types.h> +#include <syscall.h> #include <unistd.h> #include <algorithm> @@ -737,6 +738,97 @@ TEST_P(PrivateAndSharedFutexTest, PITryLockConcurrency_NoRandomSave) { } } +int get_robust_list(int pid, struct robust_list_head** head_ptr, + size_t* len_ptr) { + return syscall(__NR_get_robust_list, pid, head_ptr, len_ptr); +} + +int set_robust_list(struct robust_list_head* head, size_t len) { + return syscall(__NR_set_robust_list, head, len); +} + +TEST(RobustFutexTest, BasicSetGet) { + struct robust_list_head hd = {}; + struct robust_list_head* hd_ptr = &hd; + + // Set! + EXPECT_THAT(set_robust_list(hd_ptr, sizeof(hd)), SyscallSucceedsWithValue(0)); + + // Get! + struct robust_list_head* new_hd_ptr = hd_ptr; + size_t len; + EXPECT_THAT(get_robust_list(0, &new_hd_ptr, &len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(new_hd_ptr, hd_ptr); + EXPECT_EQ(len, sizeof(hd)); +} + +TEST(RobustFutexTest, GetFromOtherTid) { + // Get the current tid and list head. + pid_t tid = gettid(); + struct robust_list_head* hd_ptr = {}; + size_t len; + EXPECT_THAT(get_robust_list(0, &hd_ptr, &len), SyscallSucceedsWithValue(0)); + + // Create a new thread. + ScopedThread t([&] { + // Current tid list head should be different from parent tid. + struct robust_list_head* got_hd_ptr = {}; + EXPECT_THAT(get_robust_list(0, &got_hd_ptr, &len), + SyscallSucceedsWithValue(0)); + EXPECT_NE(hd_ptr, got_hd_ptr); + + // Get the parent list head by passing its tid. + EXPECT_THAT(get_robust_list(tid, &got_hd_ptr, &len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(hd_ptr, got_hd_ptr); + }); + + // Wait for thread. + t.Join(); +} + +TEST(RobustFutexTest, InvalidSize) { + struct robust_list_head* hd = {}; + EXPECT_THAT(set_robust_list(hd, sizeof(*hd) + 1), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(RobustFutexTest, PthreadMutexAttr) { + constexpr int kNumMutexes = 3; + + // Create a bunch of robust mutexes. + pthread_mutexattr_t attrs[kNumMutexes]; + pthread_mutex_t mtxs[kNumMutexes]; + for (int i = 0; i < kNumMutexes; i++) { + TEST_PCHECK(pthread_mutexattr_init(&attrs[i]) == 0); + TEST_PCHECK(pthread_mutexattr_setrobust(&attrs[i], PTHREAD_MUTEX_ROBUST) == + 0); + TEST_PCHECK(pthread_mutex_init(&mtxs[i], &attrs[i]) == 0); + } + + // Start thread to lock the mutexes and then exit. + ScopedThread t([&] { + for (int i = 0; i < kNumMutexes; i++) { + TEST_PCHECK(pthread_mutex_lock(&mtxs[i]) == 0); + } + pthread_exit(NULL); + }); + + // Wait for thread. + t.Join(); + + // Now try to take the mutexes. + for (int i = 0; i < kNumMutexes; i++) { + // Should get EOWNERDEAD. + EXPECT_EQ(pthread_mutex_lock(&mtxs[i]), EOWNERDEAD); + // Make the mutex consistent. + EXPECT_EQ(pthread_mutex_consistent(&mtxs[i]), 0); + // Unlock. + EXPECT_EQ(pthread_mutex_unlock(&mtxs[i]), 0); + } +} + } // namespace } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/getdents.cc b/test/syscalls/linux/getdents.cc index b147d6181..b040cdcf7 100644 --- a/test/syscalls/linux/getdents.cc +++ b/test/syscalls/linux/getdents.cc @@ -32,6 +32,7 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "absl/container/node_hash_set.h" #include "absl/strings/numbers.h" #include "absl/strings/str_cat.h" #include "test/util/eventfd_util.h" @@ -393,7 +394,7 @@ TYPED_TEST(GetdentsTest, ProcSelfFd) { // Make the buffer very small since we want to iterate. typename TestFixture::DirentBufferType dirents( 2 * sizeof(typename TestFixture::LinuxDirentType)); - std::unordered_set<int> prev_fds; + absl::node_hash_set<int> prev_fds; while (true) { dirents.Reset(); int rv; diff --git a/test/syscalls/linux/mount.cc b/test/syscalls/linux/mount.cc index a3e9745cf..97e8d0f7e 100644 --- a/test/syscalls/linux/mount.cc +++ b/test/syscalls/linux/mount.cc @@ -321,6 +321,28 @@ TEST(MountTest, RenameRemoveMountPoint) { ASSERT_THAT(rmdir(dir.path().c_str()), SyscallFailsWithErrno(EBUSY)); } +TEST(MountTest, MountFuseFilesystemNoDevice) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + SKIP_IF(IsRunningOnGvisor() && !IsFUSEEnabled()); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + EXPECT_THAT(mount("", dir.path().c_str(), "fuse", 0, ""), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(MountTest, MountFuseFilesystem) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + SKIP_IF(IsRunningOnGvisor() && !IsFUSEEnabled()); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/fuse", O_WRONLY)); + std::string mopts = "fd=" + std::to_string(fd.get()); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto const mount = + ASSERT_NO_ERRNO_AND_VALUE(Mount("", dir.path(), "fuse", 0, mopts, 0)); +} + } // namespace } // namespace testing diff --git a/test/syscalls/linux/open.cc b/test/syscalls/linux/open.cc index bb7d108e8..bf350946b 100644 --- a/test/syscalls/linux/open.cc +++ b/test/syscalls/linux/open.cc @@ -235,7 +235,7 @@ TEST_F(OpenTest, AppendOnly) { ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR | O_APPEND)); EXPECT_THAT(lseek(fd2.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0)); - // Then try to write to the first file and make sure the bytes are appended. + // Then try to write to the first fd and make sure the bytes are appended. EXPECT_THAT(WriteFd(fd1.get(), buf.data(), buf.size()), SyscallSucceedsWithValue(buf.size())); @@ -247,7 +247,7 @@ TEST_F(OpenTest, AppendOnly) { EXPECT_THAT(lseek(fd1.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(kBufSize * 2)); - // Then try to write to the second file and make sure the bytes are appended. + // Then try to write to the second fd and make sure the bytes are appended. EXPECT_THAT(WriteFd(fd2.get(), buf.data(), buf.size()), SyscallSucceedsWithValue(buf.size())); diff --git a/test/syscalls/linux/packet_socket.cc b/test/syscalls/linux/packet_socket.cc index e94ddcb77..40aa9326d 100644 --- a/test/syscalls/linux/packet_socket.cc +++ b/test/syscalls/linux/packet_socket.cc @@ -417,6 +417,122 @@ TEST_P(CookedPacketTest, BindDrop) { EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 1000), SyscallSucceedsWithValue(0)); } +// Verify that we receive outbound packets. This test requires at least one +// non loopback interface so that we can actually capture an outgoing packet. +TEST_P(CookedPacketTest, ReceiveOutbound) { + // Only ETH_P_ALL sockets can receive outbound packets on linux. + SKIP_IF(GetParam() != ETH_P_ALL); + + // Let's use a simple IP payload: a UDP datagram. + FileDescriptor udp_sock = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); + + struct ifaddrs* if_addr_list = nullptr; + auto cleanup = Cleanup([&if_addr_list]() { freeifaddrs(if_addr_list); }); + + ASSERT_THAT(getifaddrs(&if_addr_list), SyscallSucceeds()); + + // Get interface other than loopback. + struct ifreq ifr = {}; + for (struct ifaddrs* i = if_addr_list; i; i = i->ifa_next) { + if (strcmp(i->ifa_name, "lo") != 0) { + strncpy(ifr.ifr_name, i->ifa_name, sizeof(ifr.ifr_name)); + break; + } + } + + // Skip if no interface is available other than loopback. + if (strlen(ifr.ifr_name) == 0) { + GTEST_SKIP(); + } + + // Get interface index and name. + EXPECT_THAT(ioctl(socket_, SIOCGIFINDEX, &ifr), SyscallSucceeds()); + EXPECT_NE(ifr.ifr_ifindex, 0); + int ifindex = ifr.ifr_ifindex; + + constexpr int kMACSize = 6; + char hwaddr[kMACSize]; + // Get interface address. + ASSERT_THAT(ioctl(socket_, SIOCGIFHWADDR, &ifr), SyscallSucceeds()); + ASSERT_THAT(ifr.ifr_hwaddr.sa_family, + AnyOf(Eq(ARPHRD_NONE), Eq(ARPHRD_ETHER))); + memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, kMACSize); + + // Just send it to the google dns server 8.8.8.8. It's UDP we don't care + // if it actually gets to the DNS Server we just want to see that we receive + // it on our AF_PACKET socket. + // + // NOTE: We just want to pick an IP that is non-local to avoid having to + // handle ARP as this should cause the UDP packet to be sent to the default + // gateway configured for the system under test. Otherwise the only packet we + // will see is the ARP query unless we picked an IP which will actually + // resolve. The test is a bit brittle but this was the best compromise for + // now. + struct sockaddr_in dest = {}; + ASSERT_EQ(inet_pton(AF_INET, "8.8.8.8", &dest.sin_addr.s_addr), 1); + dest.sin_family = AF_INET; + dest.sin_port = kPort; + EXPECT_THAT(sendto(udp_sock.get(), kMessage, sizeof(kMessage), 0, + reinterpret_cast<struct sockaddr*>(&dest), sizeof(dest)), + SyscallSucceedsWithValue(sizeof(kMessage))); + + // Wait and make sure the socket receives the data. + struct pollfd pfd = {}; + pfd.fd = socket_; + pfd.events = POLLIN; + EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 1000), SyscallSucceedsWithValue(1)); + + // Now read and check that the packet is the one we just sent. + // Read and verify the data. + constexpr size_t packet_size = + sizeof(struct iphdr) + sizeof(struct udphdr) + sizeof(kMessage); + char buf[64]; + struct sockaddr_ll src = {}; + socklen_t src_len = sizeof(src); + ASSERT_THAT(recvfrom(socket_, buf, sizeof(buf), 0, + reinterpret_cast<struct sockaddr*>(&src), &src_len), + SyscallSucceedsWithValue(packet_size)); + + // sockaddr_ll ends with an 8 byte physical address field, but ethernet + // addresses only use 6 bytes. Linux used to return sizeof(sockaddr_ll)-2 + // here, but since commit b2cf86e1563e33a14a1c69b3e508d15dc12f804c returns + // sizeof(sockaddr_ll). + ASSERT_THAT(src_len, AnyOf(Eq(sizeof(src)), Eq(sizeof(src) - 2))); + + // Verify the source address. + EXPECT_EQ(src.sll_family, AF_PACKET); + EXPECT_EQ(src.sll_ifindex, ifindex); + EXPECT_EQ(src.sll_halen, ETH_ALEN); + EXPECT_EQ(ntohs(src.sll_protocol), ETH_P_IP); + EXPECT_EQ(src.sll_pkttype, PACKET_OUTGOING); + // Verify the link address of the interface matches that of the non + // non loopback interface address we stored above. + for (int i = 0; i < src.sll_halen; i++) { + EXPECT_EQ(src.sll_addr[i], hwaddr[i]); + } + + // Verify the IP header. + struct iphdr ip = {}; + memcpy(&ip, buf, sizeof(ip)); + EXPECT_EQ(ip.ihl, 5); + EXPECT_EQ(ip.version, 4); + EXPECT_EQ(ip.tot_len, htons(packet_size)); + EXPECT_EQ(ip.protocol, IPPROTO_UDP); + EXPECT_EQ(ip.daddr, dest.sin_addr.s_addr); + EXPECT_NE(ip.saddr, htonl(INADDR_LOOPBACK)); + + // Verify the UDP header. + struct udphdr udp = {}; + memcpy(&udp, buf + sizeof(iphdr), sizeof(udp)); + EXPECT_EQ(udp.dest, kPort); + EXPECT_EQ(udp.len, htons(sizeof(udphdr) + sizeof(kMessage))); + + // Verify the payload. + char* payload = reinterpret_cast<char*>(buf + sizeof(iphdr) + sizeof(udphdr)); + EXPECT_EQ(strncmp(payload, kMessage, sizeof(kMessage)), 0); +} + // Bind with invalid address. TEST_P(CookedPacketTest, BindFail) { // Null address. diff --git a/test/syscalls/linux/raw_socket.cc b/test/syscalls/linux/raw_socket.cc index ce54dc064..8d6e5c913 100644 --- a/test/syscalls/linux/raw_socket.cc +++ b/test/syscalls/linux/raw_socket.cc @@ -262,6 +262,27 @@ TEST_P(RawSocketTest, SendWithoutConnectFails) { SyscallFailsWithErrno(EDESTADDRREQ)); } +// Wildcard Bind. +TEST_P(RawSocketTest, BindToWildcard) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + struct sockaddr_storage addr; + addr = {}; + + // We don't set ports because raw sockets don't have a notion of ports. + if (Family() == AF_INET) { + struct sockaddr_in* sin = reinterpret_cast<struct sockaddr_in*>(&addr); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = htonl(INADDR_ANY); + } else { + struct sockaddr_in6* sin6 = reinterpret_cast<struct sockaddr_in6*>(&addr); + sin6->sin6_family = AF_INET6; + sin6->sin6_addr = in6addr_any; + } + + ASSERT_THAT(bind(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()), + SyscallSucceeds()); +} + // Bind to localhost. TEST_P(RawSocketTest, BindToLocalhost) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); diff --git a/test/syscalls/linux/raw_socket_hdrincl.cc b/test/syscalls/linux/raw_socket_hdrincl.cc index 5bb14d57c..97f0467aa 100644 --- a/test/syscalls/linux/raw_socket_hdrincl.cc +++ b/test/syscalls/linux/raw_socket_hdrincl.cc @@ -178,6 +178,9 @@ TEST_F(RawHDRINCL, ConnectToLoopback) { } TEST_F(RawHDRINCL, SendWithoutConnectSucceeds) { + // FIXME(github.dev/issue/3159): Test currently flaky. + SKIP_IF(true); + struct iphdr hdr = LoopbackHeader(); ASSERT_THAT(send(socket_, &hdr, sizeof(hdr), 0), SyscallSucceedsWithValue(sizeof(hdr))); @@ -281,6 +284,9 @@ TEST_F(RawHDRINCL, SendAndReceive) { // Send and receive a packet where the sendto address is not the same as the // provided destination. TEST_F(RawHDRINCL, SendAndReceiveDifferentAddress) { + // FIXME(github.dev/issue/3160): Test currently flaky. + SKIP_IF(true); + int port = 40000; if (!IsRunningOnGvisor()) { port = static_cast<short>(ASSERT_NO_ERRNO_AND_VALUE( diff --git a/tools/bazel.mk b/tools/bazel.mk index 9e02af8dc..45d6007cf 100644 --- a/tools/bazel.mk +++ b/tools/bazel.mk @@ -15,6 +15,7 @@ # limitations under the License. # See base Makefile. +SHELL=/bin/bash -o pipefail BRANCH_NAME := $(shell (git branch --show-current 2>/dev/null || \ git rev-parse --abbrev-ref HEAD 2>/dev/null) | \ xargs -n 1 basename 2>/dev/null) @@ -22,22 +23,38 @@ BRANCH_NAME := $(shell (git branch --show-current 2>/dev/null || \ # Bazel container configuration (see below). USER ?= gvisor HASH ?= $(shell readlink -m $(CURDIR) | md5sum | cut -c1-8) +BUILDER_BASE := gvisor.dev/images/default +BUILDER_IMAGE := gvisor.dev/images/builder +BUILDER_NAME ?= gvisor-builder-$(HASH) DOCKER_NAME ?= gvisor-bazel-$(HASH) DOCKER_PRIVILEGED ?= --privileged BAZEL_CACHE := $(shell readlink -m ~/.cache/bazel/) GCLOUD_CONFIG := $(shell readlink -m ~/.config/gcloud/) DOCKER_SOCKET := /var/run/docker.sock -# Non-configurable. +# Bazel flags. +OPTIONS += --test_output=errors --keep_going --verbose_failures=true +BAZEL := bazel $(STARTUP_OPTIONS) + +# Basic options. UID := $(shell id -u ${USER}) GID := $(shell id -g ${USER}) USERADD_OPTIONS := FULL_DOCKER_RUN_OPTIONS := $(DOCKER_RUN_OPTIONS) +FULL_DOCKER_RUN_OPTIONS += --user $(UID):$(GID) +FULL_DOCKER_RUN_OPTIONS += --entrypoint "" +FULL_DOCKER_RUN_OPTIONS += --init FULL_DOCKER_RUN_OPTIONS += -v "$(BAZEL_CACHE):$(BAZEL_CACHE)" FULL_DOCKER_RUN_OPTIONS += -v "$(GCLOUD_CONFIG):$(GCLOUD_CONFIG)" FULL_DOCKER_RUN_OPTIONS += -v "/tmp:/tmp" +FULL_DOCKER_EXEC_OPTIONS := --user $(UID):$(GID) +FULL_DOCKER_EXEC_OPTIONS += -i + +# Add docker passthrough options. ifneq ($(DOCKER_PRIVILEGED),) FULL_DOCKER_RUN_OPTIONS += -v "$(DOCKER_SOCKET):$(DOCKER_SOCKET)" +FULL_DOCKER_RUN_OPTIONS += $(DOCKER_PRIVILEGED) +FULL_DOCKER_EXEC_OPTIONS += $(DOCKER_PRIVILEGED) DOCKER_GROUP := $(shell stat -c '%g' $(DOCKER_SOCKET)) ifneq ($(GID),$(DOCKER_GROUP)) USERADD_OPTIONS += --groups $(DOCKER_GROUP) @@ -45,7 +62,35 @@ GROUPADD_DOCKER += groupadd --gid $(DOCKER_GROUP) --non-unique docker-$(HASH) && FULL_DOCKER_RUN_OPTIONS += --group-add $(DOCKER_GROUP) endif endif -SHELL=/bin/bash -o pipefail + +# Add KVM passthrough options. +ifneq (,$(wildcard /dev/kvm)) +FULL_DOCKER_RUN_OPTIONS += --device=/dev/kvm +KVM_GROUP := $(shell stat -c '%g' /dev/kvm) +ifneq ($(GID),$(KVM_GROUP)) +USERADD_OPTIONS += --groups $(KVM_GROUP) +GROUPADD_DOCKER += groupadd --gid $(KVM_GROUP) --non-unique kvm-$(HASH) && +FULL_DOCKER_RUN_OPTIONS += --group-add $(KVM_GROUP) +endif +endif + +# Load the appropriate config. +ifneq (,$(BAZEL_CONFIG)) +OPTIONS += --config=$(BAZEL_CONFIG) +endif + +bazel-image: load-default + @if docker ps --all | grep $(BUILDER_NAME); then docker rm -f $(BUILDER_NAME); fi + docker run --user 0:0 --entrypoint "" --name $(BUILDER_NAME) \ + $(BUILDER_BASE) \ + sh -c "groupadd --gid $(GID) --non-unique $(USER) && \ + $(GROUPADD_DOCKER) \ + useradd --uid $(UID) --non-unique --no-create-home \ + --gid $(GID) $(USERADD_OPTIONS) -d $(HOME) $(USER) && \ + if [[ -e /dev/kvm ]]; then chmod a+rw /dev/kvm; fi" + docker commit $(BUILDER_NAME) $(BUILDER_IMAGE) + @docker rm -f $(BUILDER_NAME) +.PHONY: bazel-image ## ## Bazel helpers. @@ -60,40 +105,37 @@ SHELL=/bin/bash -o pipefail ## GCLOUD_CONFIG - The gcloud config directory (detect: detected). ## DOCKER_SOCKET - The Docker socket (default: detected). ## -bazel-server-start: load-default ## Starts the bazel server. +bazel-server-start: bazel-image ## Starts the bazel server. @mkdir -p $(BAZEL_CACHE) @mkdir -p $(GCLOUD_CONFIG) - docker run -d --rm \ - --init \ - --name $(DOCKER_NAME) \ - --user 0:0 $(DOCKER_GROUP_OPTIONS) \ + @if docker ps --all | grep $(DOCKER_NAME); then docker rm -f $(DOCKER_NAME); fi + # This command runs a bazel server, and the container sticks around + # until the bazel server exits. This should ensure that it does not + # exit in the middle of running a build, but also it won't stick around + # forever. The build commands wrap around an appropriate exec into the + # container in order to perform work via the bazel client. + docker run -d --rm --name $(DOCKER_NAME) \ -v "$(CURDIR):$(CURDIR)" \ --workdir "$(CURDIR)" \ - --entrypoint "" \ $(FULL_DOCKER_RUN_OPTIONS) \ - gvisor.dev/images/default \ - sh -c "groupadd --gid $(GID) --non-unique $(USER) && \ - $(GROUPADD_DOCKER) \ - useradd --uid $(UID) --non-unique --no-create-home --gid $(GID) $(USERADD_OPTIONS) -d $(HOME) $(USER) && \ - bazel version && \ - exec tail --pid=\$$(bazel info server_pid) -f /dev/null" - @while :; do if docker logs $(DOCKER_NAME) 2>/dev/null | grep "Build label:" >/dev/null; then break; fi; \ - if ! docker ps | grep $(DOCKER_NAME); then exit 1; else sleep 1; fi; done + $(BUILDER_IMAGE) \ + sh -c "tail -f --pid=\$$($(BAZEL) info server_pid)" .PHONY: bazel-server-start bazel-shutdown: ## Shuts down a running bazel server. - @docker exec --user $(UID):$(GID) $(DOCKER_NAME) bazel shutdown; rc=$$?; docker kill $(DOCKER_NAME) || [[ $$rc -ne 0 ]] + @docker exec $(FULL_DOCKER_EXEC_OPTIONS) $(DOCKER_NAME) $(BAZEL) shutdown; \ + rc=$$?; docker kill $(DOCKER_NAME) || [[ $$rc -ne 0 ]] .PHONY: bazel-shutdown bazel-alias: ## Emits an alias that can be used within the shell. - @echo "alias bazel='docker exec --user $(UID):$(GID) -i $(DOCKER_NAME) bazel'" + @echo "alias bazel='docker exec $(FULL_DOCKER_EXEC_OPTIONS) $(DOCKER_NAME) bazel'" .PHONY: bazel-alias bazel-server: ## Ensures that the server exists. Used as an internal target. - @docker exec $(DOCKER_NAME) true || $(MAKE) bazel-server-start + @docker exec $(FULL_DOCKER_EXEC_OPTIONS) $(DOCKER_NAME) true || $(MAKE) bazel-server-start .PHONY: bazel-server -build_cmd = docker exec --user $(UID):$(GID) -i $(DOCKER_NAME) sh -o pipefail -c 'bazel $(STARTUP_OPTIONS) build $(OPTIONS) $(TARGETS)' +build_cmd = docker exec $(FULL_DOCKER_EXEC_OPTIONS) $(DOCKER_NAME) sh -o pipefail -c '$(BAZEL) build $(OPTIONS) $(TARGETS)' build_paths = $(build_cmd) 2>&1 \ | tee /proc/self/fd/2 \ @@ -120,5 +162,9 @@ sudo: bazel-server .PHONY: sudo test: bazel-server - @docker exec --user $(UID):$(GID) -i $(DOCKER_NAME) bazel $(STARTUP_OPTIONS) test $(OPTIONS) $(TARGETS) + @docker exec $(FULL_DOCKER_EXEC_OPTIONS) $(DOCKER_NAME) $(BAZEL) test $(OPTIONS) $(TARGETS) .PHONY: test + +query: bazel-server + @docker exec $(FULL_DOCKER_EXEC_OPTIONS) $(DOCKER_NAME) $(BAZEL) query $(OPTIONS) '$(TARGETS)' +.PHONY: query diff --git a/tools/bazeldefs/BUILD b/tools/bazeldefs/BUILD index f2f80bae1..3f809065d 100644 --- a/tools/bazeldefs/BUILD +++ b/tools/bazeldefs/BUILD @@ -49,3 +49,40 @@ rbe_toolchain( toolchain = "@bazel_toolchains//configs/ubuntu16_04_clang/10.0.0/bazel_2.0.0/cc:cc-compiler-k8", toolchain_type = "@bazel_tools//tools/cpp:toolchain_type", ) + +# Updated versions of the above, compatible with bazel3. +rbe_platform( + name = "rbe_ubuntu1604_bazel3", + constraint_values = [ + "@bazel_tools//platforms:x86_64", + "@bazel_tools//platforms:linux", + "@bazel_tools//tools/cpp:clang", + "@bazel_toolchains_bazel3//constraints:xenial", + "@bazel_toolchains_bazel3//constraints/sanitizers:support_msan", + ], + remote_execution_properties = """ + properties: { + name: "container-image" + value:"docker://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04@sha256:b516a2d69537cb40a7c6a7d92d0008abb29fba8725243772bdaf2c83f1be2272" + } + properties: { + name: "dockerAddCapabilities" + value: "SYS_ADMIN" + } + properties: { + name: "dockerPrivileged" + value: "true" + } + """, +) + +rbe_toolchain( + name = "cc-toolchain-clang-x86_64-default_bazel3", + exec_compatible_with = [], + tags = [ + "manual", + ], + target_compatible_with = [], + toolchain = "@bazel_toolchains_bazel3//configs/ubuntu16_04_clang/11.0.0/bazel_3.1.0/cc:cc-compiler-k8", + toolchain_type = "@bazel_tools//tools/cpp:toolchain_type", +) diff --git a/tools/go_generics/BUILD b/tools/go_generics/BUILD index 32a949c93..558826bf1 100644 --- a/tools/go_generics/BUILD +++ b/tools/go_generics/BUILD @@ -12,27 +12,3 @@ go_binary( visibility = ["//:sandbox"], deps = ["//tools/go_generics/globals"], ) - -genrule( - name = "go_generics_tests", - srcs = glob(["generics_tests/**"]) + [":go_generics"], - outs = ["go_generics_tests.tgz"], - cmd = "tar -czvhf $@ $(SRCS)", -) - -genrule( - name = "go_generics_test_bundle", - srcs = [ - ":go_generics_tests.tgz", - ":go_generics_unittest.sh", - ], - outs = ["go_generics_test.sh"], - cmd = "cat $(location :go_generics_unittest.sh) $(location :go_generics_tests.tgz) > $@", - executable = True, -) - -sh_test( - name = "go_generics_test", - size = "small", - srcs = ["go_generics_test.sh"], -) diff --git a/tools/go_generics/defs.bzl b/tools/go_generics/defs.bzl index ec047a644..33329cf28 100644 --- a/tools/go_generics/defs.bzl +++ b/tools/go_generics/defs.bzl @@ -100,20 +100,21 @@ def _go_template_instance_impl(ctx): # Build the argument list. args = ["-i=%s" % template.file.path, "-o=%s" % output.path] - args += ["-p=%s" % ctx.attr.package] + if ctx.attr.package: + args.append("-p=%s" % ctx.attr.package) if len(ctx.attr.prefix) > 0: - args += ["-prefix=%s" % ctx.attr.prefix] + args.append("-prefix=%s" % ctx.attr.prefix) if len(ctx.attr.suffix) > 0: - args += ["-suffix=%s" % ctx.attr.suffix] + args.append("-suffix=%s" % ctx.attr.suffix) args += [("-t=%s=%s" % (p[0], p[1])) for p in ctx.attr.types.items()] args += [("-c=%s=%s" % (p[0], p[1])) for p in ctx.attr.consts.items()] args += [("-import=%s=%s" % (p[0], p[1])) for p in ctx.attr.imports.items()] if ctx.attr.anon: - args += ["-anon"] + args.append("-anon") ctx.actions.run( inputs = [template.file], @@ -151,7 +152,7 @@ go_template_instance = rule( "consts": attr.string_dict(), "imports": attr.string_dict(), "anon": attr.bool(mandatory = False, default = False), - "package": attr.string(mandatory = True), + "package": attr.string(mandatory = False), "out": attr.output(mandatory = True), "_tool": attr.label(executable = True, cfg = "host", default = Label("//tools/go_generics")), }, diff --git a/tools/go_generics/generics_tests/all_stmts/opts.txt b/tools/go_generics/generics_tests/all_stmts/opts.txt deleted file mode 100644 index c9d0e09bf..000000000 --- a/tools/go_generics/generics_tests/all_stmts/opts.txt +++ /dev/null @@ -1 +0,0 @@ --t=T=Q diff --git a/tools/go_generics/generics_tests/all_types/opts.txt b/tools/go_generics/generics_tests/all_types/opts.txt deleted file mode 100644 index c9d0e09bf..000000000 --- a/tools/go_generics/generics_tests/all_types/opts.txt +++ /dev/null @@ -1 +0,0 @@ --t=T=Q diff --git a/tools/go_generics/generics_tests/anon/opts.txt b/tools/go_generics/generics_tests/anon/opts.txt deleted file mode 100644 index a5e9d26de..000000000 --- a/tools/go_generics/generics_tests/anon/opts.txt +++ /dev/null @@ -1 +0,0 @@ --t=T=Q -suffix=New -anon diff --git a/tools/go_generics/generics_tests/consts/opts.txt b/tools/go_generics/generics_tests/consts/opts.txt deleted file mode 100644 index 4fb59dce8..000000000 --- a/tools/go_generics/generics_tests/consts/opts.txt +++ /dev/null @@ -1 +0,0 @@ --c=c1=20 -c=z=600 -c=v=3.3 -c=s="def" -c=A=20 -c=C=100 -c=S="def" -c=T="ABC" diff --git a/tools/go_generics/generics_tests/imports/opts.txt b/tools/go_generics/generics_tests/imports/opts.txt deleted file mode 100644 index 87324be79..000000000 --- a/tools/go_generics/generics_tests/imports/opts.txt +++ /dev/null @@ -1 +0,0 @@ --t=T=sync.Mutex -c=n=math.Uint32 -c=m=math.Uint64 -import=sync=sync -import=math=mymathpath diff --git a/tools/go_generics/generics_tests/remove_typedef/opts.txt b/tools/go_generics/generics_tests/remove_typedef/opts.txt deleted file mode 100644 index 9c8ecaada..000000000 --- a/tools/go_generics/generics_tests/remove_typedef/opts.txt +++ /dev/null @@ -1 +0,0 @@ --t=T=U diff --git a/tools/go_generics/generics_tests/simple/opts.txt b/tools/go_generics/generics_tests/simple/opts.txt deleted file mode 100644 index 7832ef66f..000000000 --- a/tools/go_generics/generics_tests/simple/opts.txt +++ /dev/null @@ -1 +0,0 @@ --t=T=Q -suffix=New diff --git a/tools/go_generics/go_generics_unittest.sh b/tools/go_generics/go_generics_unittest.sh deleted file mode 100755 index 44b22db91..000000000 --- a/tools/go_generics/go_generics_unittest.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash - -# Copyright 2018 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Bash "safe-mode": Treat command failures as fatal (even those that occur in -# pipes), and treat unset variables as errors. -set -eu -o pipefail - -# This file will be generated as a self-extracting shell script in order to -# eliminate the need for any runtime dependencies. The tarball at the end will -# include the go_generics binary, as well as a subdirectory named -# generics_tests. See the BUILD file for more information. -declare -r temp=$(mktemp -d) -function cleanup() { - rm -rf "${temp}" -} -# trap cleanup EXIT - -# Print message in "$1" then exit with status 1. -function die () { - echo "$1" 1>&2 - exit 1 -} - -# This prints the line number of __BUNDLE__ below, that should be the last line -# of this script. After that point, the concatenated archive will be the -# contents. -declare -r tgz=`awk '/^__BUNDLE__/ {print NR + 1; exit 0; }' $0` -tail -n+"${tgz}" $0 | tar -xzv -C "${temp}" - -# The target for the test. -declare -r binary="$(find ${temp} -type f -a -name go_generics)" -declare -r input_dirs="$(find ${temp} -type d -a -name generics_tests)/*" - -# Go through all test cases. -for f in ${input_dirs}; do - base=$(basename "${f}") - - # Run go_generics on the input file. - opts=$(head -n 1 ${f}/opts.txt) - out="${f}/output/generated.go" - expected="${f}/output/output.go" - ${binary} ${opts} "-i=${f}/input.go" "-o=${out}" || die "go_generics failed for test case \"${base}\"" - - # Compare the outputs. - diff ${expected} ${out} - if [ $? -ne 0 ]; then - echo "Expected:" - cat ${expected} - echo "Actual:" - cat ${out} - die "Actual output is different from expected for test \"${base}\"" - fi -done - -echo "PASS" -exit 0 -__BUNDLE__ diff --git a/tools/go_generics/tests/BUILD b/tools/go_generics/tests/BUILD new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/tools/go_generics/tests/BUILD diff --git a/tools/go_generics/tests/all_stmts/BUILD b/tools/go_generics/tests/all_stmts/BUILD new file mode 100644 index 000000000..a4a7c775a --- /dev/null +++ b/tools/go_generics/tests/all_stmts/BUILD @@ -0,0 +1,16 @@ +load("//tools/go_generics/tests:defs.bzl", "go_generics_test") + +go_generics_test( + name = "all_stmts", + inputs = ["input.go"], + output = "output.go", + types = { + "T": "Q", + }, +) + +# @unused +glaze_ignore = [ + "input.go", + "output.go", +] diff --git a/tools/go_generics/generics_tests/all_stmts/input.go b/tools/go_generics/tests/all_stmts/input.go index 4791d1ff1..4791d1ff1 100644 --- a/tools/go_generics/generics_tests/all_stmts/input.go +++ b/tools/go_generics/tests/all_stmts/input.go diff --git a/tools/go_generics/generics_tests/all_stmts/output/output.go b/tools/go_generics/tests/all_stmts/output.go index a53d84535..a53d84535 100644 --- a/tools/go_generics/generics_tests/all_stmts/output/output.go +++ b/tools/go_generics/tests/all_stmts/output.go diff --git a/tools/go_generics/tests/all_types/BUILD b/tools/go_generics/tests/all_types/BUILD new file mode 100644 index 000000000..60b1fd314 --- /dev/null +++ b/tools/go_generics/tests/all_types/BUILD @@ -0,0 +1,16 @@ +load("//tools/go_generics/tests:defs.bzl", "go_generics_test") + +go_generics_test( + name = "all_types", + inputs = ["input.go"], + output = "output.go", + types = { + "T": "Q", + }, +) + +# @unused +glaze_ignore = [ + "input.go", + "output.go", +] diff --git a/tools/go_generics/generics_tests/all_types/input.go b/tools/go_generics/tests/all_types/input.go index 3575d02ec..6f85bbb69 100644 --- a/tools/go_generics/generics_tests/all_types/input.go +++ b/tools/go_generics/tests/all_types/input.go @@ -14,7 +14,9 @@ package tests -import "./lib" +import ( + "./lib" +) type T int diff --git a/tools/go_generics/generics_tests/all_types/lib/lib.go b/tools/go_generics/tests/all_types/lib/lib.go index 988786496..988786496 100644 --- a/tools/go_generics/generics_tests/all_types/lib/lib.go +++ b/tools/go_generics/tests/all_types/lib/lib.go diff --git a/tools/go_generics/generics_tests/all_types/output/output.go b/tools/go_generics/tests/all_types/output.go index 41fd147a1..c0bbebfe7 100644 --- a/tools/go_generics/generics_tests/all_types/output/output.go +++ b/tools/go_generics/tests/all_types/output.go @@ -14,7 +14,9 @@ package main -import "./lib" +import ( + "./lib" +) type newType struct { a Q diff --git a/tools/go_generics/tests/anon/BUILD b/tools/go_generics/tests/anon/BUILD new file mode 100644 index 000000000..ef24f4b25 --- /dev/null +++ b/tools/go_generics/tests/anon/BUILD @@ -0,0 +1,18 @@ +load("//tools/go_generics/tests:defs.bzl", "go_generics_test") + +go_generics_test( + name = "anon", + anon = True, + inputs = ["input.go"], + output = "output.go", + suffix = "New", + types = { + "T": "Q", + }, +) + +# @unused +glaze_ignore = [ + "input.go", + "output.go", +] diff --git a/tools/go_generics/generics_tests/anon/input.go b/tools/go_generics/tests/anon/input.go index 44086d522..44086d522 100644 --- a/tools/go_generics/generics_tests/anon/input.go +++ b/tools/go_generics/tests/anon/input.go diff --git a/tools/go_generics/generics_tests/anon/output/output.go b/tools/go_generics/tests/anon/output.go index 160cddf79..7fa791853 100644 --- a/tools/go_generics/generics_tests/anon/output/output.go +++ b/tools/go_generics/tests/anon/output.go @@ -35,8 +35,8 @@ func (f FooNew) GetBar(name string) Q { func foobarNew() { a := BazNew{} - a.Q = 0 // should not be renamed, this is a limitation + a.Q = 0 b := otherpkg.UnrelatedType{} - b.Q = 0 // should not be renamed, this is a limitation + b.Q = 0 } diff --git a/tools/go_generics/tests/consts/BUILD b/tools/go_generics/tests/consts/BUILD new file mode 100644 index 000000000..fd7caccad --- /dev/null +++ b/tools/go_generics/tests/consts/BUILD @@ -0,0 +1,23 @@ +load("//tools/go_generics/tests:defs.bzl", "go_generics_test") + +go_generics_test( + name = "consts", + consts = { + "c1": "20", + "z": "600", + "v": "3.3", + "s": "\"def\"", + "A": "20", + "C": "100", + "S": "\"def\"", + "T": "\"ABC\"", + }, + inputs = ["input.go"], + output = "output.go", +) + +# @unused +glaze_ignore = [ + "input.go", + "output.go", +] diff --git a/tools/go_generics/generics_tests/consts/input.go b/tools/go_generics/tests/consts/input.go index 04b95fcc6..04b95fcc6 100644 --- a/tools/go_generics/generics_tests/consts/input.go +++ b/tools/go_generics/tests/consts/input.go diff --git a/tools/go_generics/generics_tests/consts/output/output.go b/tools/go_generics/tests/consts/output.go index 18d316cc9..18d316cc9 100644 --- a/tools/go_generics/generics_tests/consts/output/output.go +++ b/tools/go_generics/tests/consts/output.go diff --git a/tools/go_generics/tests/defs.bzl b/tools/go_generics/tests/defs.bzl new file mode 100644 index 000000000..6277c3947 --- /dev/null +++ b/tools/go_generics/tests/defs.bzl @@ -0,0 +1,67 @@ +"""Generics tests.""" + +load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") + +def _go_generics_test_impl(ctx): + runner = ctx.actions.declare_file(ctx.label.name) + runner_content = "\n".join([ + "#!/bin/bash", + "exec diff --ignore-blank-lines --ignore-matching-lines=^[[:space:]]*// %s %s" % ( + ctx.files.template_output[0].short_path, + ctx.files.expected_output[0].short_path, + ), + "", + ]) + ctx.actions.write(runner, runner_content, is_executable = True) + return [DefaultInfo( + executable = runner, + runfiles = ctx.runfiles( + files = ctx.files.template_output + ctx.files.expected_output, + collect_default = True, + collect_data = True, + ), + )] + +_go_generics_test = rule( + implementation = _go_generics_test_impl, + attrs = { + "template_output": attr.label(mandatory = True, allow_single_file = True), + "expected_output": attr.label(mandatory = True, allow_single_file = True), + }, + test = True, +) + +def go_generics_test(name, inputs, output, types = None, consts = None, **kwargs): + """Instantiates a generics test. + + Args: + name: the name of the test. + inputs: all the input files. + output: the output files. + types: the template types (dictionary). + consts: the template consts (dictionary). + **kwargs: additional arguments for the template_instance. + """ + if types == None: + types = dict() + if consts == None: + consts = dict() + go_template( + name = name + "_template", + srcs = inputs, + types = types.keys(), + consts = consts.keys(), + ) + go_template_instance( + name = name + "_output", + template = ":" + name + "_template", + out = name + "_output.go", + types = types, + consts = consts, + **kwargs + ) + _go_generics_test( + name = name + "_test", + template_output = name + "_output.go", + expected_output = output, + ) diff --git a/tools/go_generics/tests/imports/BUILD b/tools/go_generics/tests/imports/BUILD new file mode 100644 index 000000000..a86223d41 --- /dev/null +++ b/tools/go_generics/tests/imports/BUILD @@ -0,0 +1,24 @@ +load("//tools/go_generics/tests:defs.bzl", "go_generics_test") + +go_generics_test( + name = "imports", + consts = { + "n": "math.Uint32", + "m": "math.Uint64", + }, + imports = { + "sync": "sync", + "math": "mymathpath", + }, + inputs = ["input.go"], + output = "output.go", + types = { + "T": "sync.Mutex", + }, +) + +# @unused +glaze_ignore = [ + "input.go", + "output.go", +] diff --git a/tools/go_generics/generics_tests/imports/input.go b/tools/go_generics/tests/imports/input.go index 0f032c2a1..0f032c2a1 100644 --- a/tools/go_generics/generics_tests/imports/input.go +++ b/tools/go_generics/tests/imports/input.go diff --git a/tools/go_generics/generics_tests/imports/output/output.go b/tools/go_generics/tests/imports/output.go index 2488ca58c..2488ca58c 100644 --- a/tools/go_generics/generics_tests/imports/output/output.go +++ b/tools/go_generics/tests/imports/output.go diff --git a/tools/go_generics/tests/remove_typedef/BUILD b/tools/go_generics/tests/remove_typedef/BUILD new file mode 100644 index 000000000..46457cec6 --- /dev/null +++ b/tools/go_generics/tests/remove_typedef/BUILD @@ -0,0 +1,16 @@ +load("//tools/go_generics/tests:defs.bzl", "go_generics_test") + +go_generics_test( + name = "remove_typedef", + inputs = ["input.go"], + output = "output.go", + types = { + "T": "U", + }, +) + +# @unused +glaze_ignore = [ + "input.go", + "output.go", +] diff --git a/tools/go_generics/generics_tests/remove_typedef/input.go b/tools/go_generics/tests/remove_typedef/input.go index cf632bae7..cf632bae7 100644 --- a/tools/go_generics/generics_tests/remove_typedef/input.go +++ b/tools/go_generics/tests/remove_typedef/input.go diff --git a/tools/go_generics/generics_tests/remove_typedef/output/output.go b/tools/go_generics/tests/remove_typedef/output.go index d44fd8e1c..d44fd8e1c 100644 --- a/tools/go_generics/generics_tests/remove_typedef/output/output.go +++ b/tools/go_generics/tests/remove_typedef/output.go diff --git a/tools/go_generics/tests/simple/BUILD b/tools/go_generics/tests/simple/BUILD new file mode 100644 index 000000000..4b9265ea4 --- /dev/null +++ b/tools/go_generics/tests/simple/BUILD @@ -0,0 +1,17 @@ +load("//tools/go_generics/tests:defs.bzl", "go_generics_test") + +go_generics_test( + name = "simple", + inputs = ["input.go"], + output = "output.go", + suffix = "New", + types = { + "T": "Q", + }, +) + +# @unused +glaze_ignore = [ + "input.go", + "output.go", +] diff --git a/tools/go_generics/generics_tests/simple/input.go b/tools/go_generics/tests/simple/input.go index 2a917f16c..2a917f16c 100644 --- a/tools/go_generics/generics_tests/simple/input.go +++ b/tools/go_generics/tests/simple/input.go diff --git a/tools/go_generics/generics_tests/simple/output/output.go b/tools/go_generics/tests/simple/output.go index 6bfa0b25b..6bfa0b25b 100644 --- a/tools/go_generics/generics_tests/simple/output/output.go +++ b/tools/go_generics/tests/simple/output.go diff --git a/tools/go_marshal/README.md b/tools/go_marshal/README.md index 4886efddf..68d759083 100644 --- a/tools/go_marshal/README.md +++ b/tools/go_marshal/README.md @@ -9,11 +9,9 @@ automatically generating code to marshal go data structures to memory. `binary.Marshal` by moving the go runtime reflection necessary to marshal a struct to compile-time. -`go_marshal` automatically generates implementations for `abi.Marshallable` and -`safemem.{Reader,Writer}`. Call-sites for serialization (typically syscall -implementations) can directly invoke `safemem.Reader.ReadToBlocks` and -`safemem.Writer.WriteFromBlocks`. Data structures that require custom -serialization will have manual implementations for these interfaces. +`go_marshal` automatically generates implementations for `marshal.Marshallable` +and `safemem.{Reader,Writer}`. Data structures that require custom serialization +will have manual implementations for these interfaces. Data structures can be flagged for code generation by adding a struct-level comment `// +marshal`. diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go index 177013dbb..19bcd4e6a 100644 --- a/tools/go_marshal/gomarshal/generator.go +++ b/tools/go_marshal/gomarshal/generator.go @@ -413,13 +413,13 @@ func (g *Generator) Run() error { for _, t := range g.collectMarshallableTypes(a, fsets[i]) { impl := g.generateOne(t, fsets[i]) // Collect Marshallable types referenced by the generated code. - for ref, _ := range impl.ms { + for ref := range impl.ms { ms[ref] = struct{}{} } impls = append(impls, impl) // Collect imports referenced by the generated code and add them to // the list of imports we need to copy to the generated code. - for name, _ := range impl.is { + for name := range impl.is { if !g.imports.markUsed(name) { panic(fmt.Sprintf("Generated code for '%s' referenced a non-existent import with local name '%s'. Either go-marshal needs to add an import to the generated file, or a package in an input source file has a package name differ from the final component of its path, which go-marshal doesn't know how to detect; use an import alias to work around this limitation.", impl.typeName(), name)) } diff --git a/tools/go_marshal/gomarshal/generator_interfaces_struct.go b/tools/go_marshal/gomarshal/generator_interfaces_struct.go index 9cd3c9579..4b9cea08a 100644 --- a/tools/go_marshal/gomarshal/generator_interfaces_struct.go +++ b/tools/go_marshal/gomarshal/generator_interfaces_struct.go @@ -268,6 +268,10 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) { g.emit("// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.\n") g.emit("func (%s *%s) MarshalUnsafe(dst []byte) {\n", g.r, g.typeName()) g.inIndent(func() { + fallback := func() { + g.emit("// Type %s doesn't have a packed layout in memory, fallback to MarshalBytes.\n", g.typeName()) + g.emit("%s.MarshalBytes(dst)\n", g.r) + } if thisPacked { g.recordUsedImport("safecopy") g.recordUsedImport("unsafe") @@ -277,16 +281,13 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) { g.emit("safecopy.CopyIn(dst, unsafe.Pointer(%s))\n", g.r) }) g.emit("} else {\n") - g.inIndent(func() { - g.emit("%s.MarshalBytes(dst)\n", g.r) - }) + g.inIndent(fallback) g.emit("}\n") } else { g.emit("safecopy.CopyIn(dst, unsafe.Pointer(%s))\n", g.r) } } else { - g.emit("// Type %s doesn't have a packed layout in memory, fallback to MarshalBytes.\n", g.typeName()) - g.emit("%s.MarshalBytes(dst)\n", g.r) + fallback() } }) g.emit("}\n\n") @@ -294,6 +295,10 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) { g.emit("// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.\n") g.emit("func (%s *%s) UnmarshalUnsafe(src []byte) {\n", g.r, g.typeName()) g.inIndent(func() { + fallback := func() { + g.emit("// Type %s doesn't have a packed layout in memory, fallback to UnmarshalBytes.\n", g.typeName()) + g.emit("%s.UnmarshalBytes(src)\n", g.r) + } if thisPacked { g.recordUsedImport("safecopy") g.recordUsedImport("unsafe") @@ -303,16 +308,13 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) { g.emit("safecopy.CopyOut(unsafe.Pointer(%s), src)\n", g.r) }) g.emit("} else {\n") - g.inIndent(func() { - g.emit("%s.UnmarshalBytes(src)\n", g.r) - }) + g.inIndent(fallback) g.emit("}\n") } else { g.emit("safecopy.CopyOut(unsafe.Pointer(%s), src)\n", g.r) } } else { - g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName()) - g.emit("%s.UnmarshalBytes(src)\n", g.r) + fallback() } }) g.emit("}\n\n") @@ -463,8 +465,10 @@ func (g *interfaceGenerator) emitMarshallableSliceForStruct(st *ast.StructType, }) g.emit("}\n\n") - g.emit("// Handle any final partial object.\n") - g.emit("if length < size*count && length%size != 0 {\n") + g.emit("// Handle any final partial object. buf is guaranteed to be long enough for the\n") + g.emit("// final element, but may not contain valid data for the entire range. This may\n") + g.emit("// result in unmarshalling zero values for some parts of the object.\n") + g.emit("if length%size != 0 {\n") g.inIndent(func() { g.emit("idx := limit\n") g.emit("dst[idx].UnmarshalBytes(buf[size*idx:size*(idx+1)])\n") diff --git a/tools/go_marshal/marshal/marshal.go b/tools/go_marshal/marshal/marshal.go index cb2166252..85b196f08 100644 --- a/tools/go_marshal/marshal/marshal.go +++ b/tools/go_marshal/marshal/marshal.go @@ -58,18 +58,12 @@ type Marshallable interface { // likely make use of the type of these fields). SizeBytes() int - // MarshalBytes serializes a copy of a type to dst. dst may be smaller than - // SizeBytes(), which results in a part of the struct being marshalled. Note - // that this may have unexpected results for non-packed types, as implicit - // padding needs to be taken into account when reasoning about how much of - // the type is serialized. + // MarshalBytes serializes a copy of a type to dst. + // Precondition: dst must be at least SizeBytes() in length. MarshalBytes(dst []byte) - // UnmarshalBytes deserializes a type from src. src may be smaller than - // SizeBytes(), which results in a partially deserialized struct. Note that - // this may have unexpected results for non-packed types, as implicit - // padding needs to be taken into account when reasoning about how much of - // the type is deserialized. + // UnmarshalBytes deserializes a type from src. + // Precondition: src must be at least SizeBytes() in length. UnmarshalBytes(src []byte) // Packed returns true if the marshalled size of the type is the same as the @@ -89,8 +83,8 @@ type Marshallable interface { // representation to the dst buffer. This is only safe to do when the type // has no implicit padding, see Marshallable.Packed. When Packed would // return false, MarshalUnsafe should fall back to the safer but slower - // MarshalBytes. dst may be smaller than SizeBytes(), see comment for - // MarshalBytes for implications. + // MarshalBytes. + // Precondition: dst must be at least SizeBytes() in length. MarshalUnsafe(dst []byte) // UnmarshalUnsafe deserializes a type by directly copying to the underlying @@ -99,8 +93,8 @@ type Marshallable interface { // This allows much faster unmarshalling of types which have no implicit // padding, see Marshallable.Packed. When Packed would return false, // UnmarshalUnsafe should fall back to the safer but slower unmarshal - // mechanism implemented in UnmarshalBytes. src may be smaller than - // SizeBytes(), see comment for UnmarshalBytes for implications. + // mechanism implemented in UnmarshalBytes. + // Precondition: src must be at least SizeBytes() in length. UnmarshalUnsafe(src []byte) // CopyIn deserializes a Marshallable type from a task's memory. This may @@ -149,14 +143,16 @@ type Marshallable interface { // // Generates four additional functions for marshalling slices of Foos like this: // -// // MarshalUnsafeFooSlice is like Foo.MarshalUnsafe, buf for a []Foo. It's -// // more efficient that repeatedly calling calling Foo.MarshalUnsafe over a -// // []Foo in a loop. +// // MarshalUnsafeFooSlice is like Foo.MarshalUnsafe, buf for a []Foo. It +// // might be more efficient that repeatedly calling Foo.MarshalUnsafe +// // over a []Foo in a loop if the type is Packed. +// // Preconditions: dst must be at least len(src)*Foo.SizeBytes() in length. // func MarshalUnsafeFooSlice(src []Foo, dst []byte) (int, error) { ... } // -// // UnmarshalUnsafeFooSlice is like Foo.UnmarshalUnsafe, buf for a []Foo. It's -// // more efficient that repeatedly calling calling Foo.UnmarshalUnsafe over a -// // []Foo in a loop. +// // UnmarshalUnsafeFooSlice is like Foo.UnmarshalUnsafe, buf for a []Foo. It +// // might be more efficient that repeatedly calling Foo.UnmarshalUnsafe +// // over a []Foo in a loop if the type is Packed. +// // Preconditions: src must be at least len(dst)*Foo.SizeBytes() in length. // func UnmarshalUnsafeFooSlice(dst []Foo, src []byte) (int, error) { ... } // // // CopyFooSliceIn copies in a slice of Foo objects from the task's memory. diff --git a/tools/go_marshal/primitive/primitive.go b/tools/go_marshal/primitive/primitive.go index ebcf130ae..d93edda8b 100644 --- a/tools/go_marshal/primitive/primitive.go +++ b/tools/go_marshal/primitive/primitive.go @@ -17,10 +17,22 @@ package primitive import ( + "io" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/tools/go_marshal/marshal" ) +// Int8 is a marshal.Marshallable implementation for int8. +// +// +marshal slice:Int8Slice:inner +type Int8 int8 + +// Uint8 is a marshal.Marshallable implementation for uint8. +// +// +marshal slice:Uint8Slice:inner +type Uint8 uint8 + // Int16 is a marshal.Marshallable implementation for int16. // // +marshal slice:Int16Slice:inner @@ -51,6 +63,66 @@ type Int64 int64 // +marshal slice:Uint64Slice:inner type Uint64 uint64 +// ByteSlice is a marshal.Marshallable implementation for []byte. +// This is a convenience wrapper around a dynamically sized type, and can't be +// embedded in other marshallable types because it breaks assumptions made by +// go-marshal internals. It violates the "no dynamically-sized types" +// constraint of the go-marshal library. +type ByteSlice []byte + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (b *ByteSlice) SizeBytes() int { + return len(*b) +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (b *ByteSlice) MarshalBytes(dst []byte) { + copy(dst, *b) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (b *ByteSlice) UnmarshalBytes(src []byte) { + copy(*b, src) +} + +// Packed implements marshal.Marshallable.Packed. +func (b *ByteSlice) Packed() bool { + return false +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (b *ByteSlice) MarshalUnsafe(dst []byte) { + b.MarshalBytes(dst) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (b *ByteSlice) UnmarshalUnsafe(src []byte) { + b.UnmarshalBytes(src) +} + +// CopyIn implements marshal.Marshallable.CopyIn. +func (b *ByteSlice) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + return task.CopyInBytes(addr, *b) +} + +// CopyOut implements marshal.Marshallable.CopyOut. +func (b *ByteSlice) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + return task.CopyOutBytes(addr, *b) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +func (b *ByteSlice) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + return task.CopyOutBytes(addr, (*b)[:limit]) +} + +// WriteTo implements io.WriterTo.WriteTo. +func (b *ByteSlice) WriteTo(w io.Writer) (int64, error) { + n, err := w.Write(*b) + return int64(n), err +} + +var _ marshal.Marshallable = (*ByteSlice)(nil) + // Below, we define some convenience functions for marshalling primitive types // using the newtypes above, without requiring superfluous casts. diff --git a/tools/nogo/build.go b/tools/nogo/build.go index fb9f17b62..433d13738 100644 --- a/tools/nogo/build.go +++ b/tools/nogo/build.go @@ -31,10 +31,10 @@ var ( ) // findStdPkg needs to find the bundled standard library packages. -func findStdPkg(path, GOOS, GOARCH string) (io.ReadCloser, error) { +func (i *importer) findStdPkg(path string) (io.ReadCloser, error) { if path == "C" { // Cgo builds cannot be analyzed. Skip. return nil, ErrSkip } - return os.Open(fmt.Sprintf("external/go_sdk/pkg/%s_%s/%s.a", GOOS, GOARCH, path)) + return os.Open(fmt.Sprintf("external/go_sdk/pkg/%s_%s/%s.a", i.GOOS, i.GOARCH, path)) } diff --git a/tools/nogo/defs.bzl b/tools/nogo/defs.bzl index 6560b57c8..d399079c5 100644 --- a/tools/nogo/defs.bzl +++ b/tools/nogo/defs.bzl @@ -28,8 +28,10 @@ def _nogo_aspect_impl(target, ctx): else: return [NogoInfo()] - # Construct the Go environment from the go_context.env dictionary. - env_prefix = " ".join(["%s=%s" % (key, value) for (key, value) in go_context(ctx).env.items()]) + go_ctx = go_context(ctx) + + # Construct the Go environment from the go_ctx.env dictionary. + env_prefix = " ".join(["%s=%s" % (key, value) for (key, value) in go_ctx.env.items()]) # Start with all target files and srcs as input. inputs = target.files.to_list() + srcs @@ -45,7 +47,7 @@ def _nogo_aspect_impl(target, ctx): "#!/bin/bash", "%s %s tool objdump %s > %s\n" % ( env_prefix, - go_context(ctx).go.path, + go_ctx.go.path, [f.path for f in binaries if f.path.endswith(".a")][0], disasm_file.path, ), @@ -53,7 +55,7 @@ def _nogo_aspect_impl(target, ctx): ctx.actions.run( inputs = binaries, outputs = [disasm_file], - tools = go_context(ctx).runfiles, + tools = go_ctx.runfiles, mnemonic = "GoObjdump", progress_message = "Objdump %s" % target.label, executable = dumper, @@ -70,9 +72,11 @@ def _nogo_aspect_impl(target, ctx): ImportPath = importpath, GoFiles = [src.path for src in srcs if src.path.endswith(".go")], NonGoFiles = [src.path for src in srcs if not src.path.endswith(".go")], - GOOS = go_context(ctx).goos, - GOARCH = go_context(ctx).goarch, - Tags = go_context(ctx).tags, + # Google's internal build system needs a bit more help to find std. + StdZip = go_ctx.std_zip.short_path if hasattr(go_ctx, "std_zip") else "", + GOOS = go_ctx.goos, + GOARCH = go_ctx.goarch, + Tags = go_ctx.tags, FactMap = {}, # Constructed below. ImportMap = {}, # Constructed below. FactOutput = facts.path, @@ -110,7 +114,7 @@ def _nogo_aspect_impl(target, ctx): ctx.actions.run( inputs = inputs, outputs = [facts], - tools = go_context(ctx).runfiles, + tools = go_ctx.runfiles, executable = ctx.files._nogo[0], mnemonic = "GoStaticAnalysis", progress_message = "Analyzing %s" % target.label, diff --git a/tools/nogo/nogo.go b/tools/nogo/nogo.go index 5ee586c3e..ea1e97076 100644 --- a/tools/nogo/nogo.go +++ b/tools/nogo/nogo.go @@ -55,6 +55,7 @@ type pkgConfig struct { FactMap map[string]string FactOutput string Objdump string + StdZip string } // loadFacts finds and loads facts per FactMap. @@ -111,7 +112,7 @@ func (i *importer) Import(path string) (*types.Package, error) { if !ok { // Not found in the import path. Attempt to find the package // via the standard library. - rc, err = findStdPkg(path, i.GOOS, i.GOARCH) + rc, err = i.findStdPkg(path) } else { // Open the file. rc, err = os.Open(realPath) diff --git a/tools/vm/ubuntu1604/30_docker.sh b/tools/vm/ubuntu1604/30_docker.sh index 332a03dfb..d393133e4 100755 --- a/tools/vm/ubuntu1604/30_docker.sh +++ b/tools/vm/ubuntu1604/30_docker.sh @@ -54,8 +54,11 @@ while true; do done # Enable experimental features, for cross-building aarch64 images. +# Enable Docker IPv6. cat > /etc/docker/daemon.json <<EOF { - "experimental": true + "experimental": true, + "fixed-cidr-v6": "2001:db8:1::/64", + "ipv6": true } EOF |